In this report you will find all the steps performed to analyze microarray data coming from healthy and cancer affected patients with BRCA1/2 mutated and no-mutated. The array used is the GeneChip Human Genome U133 Plus 2.0.

1 Working directory and libraries

Set working directory and the libraries needed to load the files:

wd<-setwd("C:/Users/MARINA/Documents/MASTER/TFM/Data_analysis")

# wd of the Cel files
cel.wd<-'../Initial_analysis/CEl_files'

library(readxl)
library(affy)
library(limma)
library(glmnet)
library(immunedeconv)
library(tidyverse)
library(ggplot2)
library(DT)

2 Load and prepare the data

Load the file that contains the phenodata (treatment, mutation, Disease …) and the CEL files. Then, check that the phenodata and CEL files are in the same order.

#Load pheno data
pdata<-read.csv2('../Initial_analysis/phenodata.csv',header=TRUE)

#change the wd to where the cel files are
setwd(file.path(wd,cel.wd))

# get eligible data ("CEL" files)
celFiles = list.files(file.path(wd,cel.wd), pattern = "CEL")

# --- Pdata and cel files have to be in the same order ---
pdata$ID<-gsub(' ','_',pdata$ID)
order<-pdata$ID
factor_celfiles<-factor(celFiles,levels = order)
celFiles_ordered<-celFiles[order(factor_celfiles)]

#check the order
identical(celFiles_ordered,pdata$ID)#same order
## [1] TRUE
head(cbind(celFiles_ordered, pdata))
##   celFiles_ordered        ID Treatment Mutation Disease Phenotype    ShortName
## 1        SG-17.CEL SG-17.CEL       NOR    BRCA2      SA  BRCA2.SA BRCA2.SA.NOR
## 2        SG-18.CEL SG-18.CEL       RAD    BRCA2      SA  BRCA2.SA BRCA2.SA.RAD
## 3        SG-19.CEL SG-19.CEL       NOR    BRCA2      SA  BRCA2.SA BRCA2.SA.NOR
## 4        SG-20.CEL SG-20.CEL       RAD    BRCA2      SA  BRCA2.SA BRCA2.SA.RAD
## 5        SG-21.CEL SG-21.CEL       NOR    NOMUT      SA  NOMUT.SA NOMUT.SA.NOR
## 6        SG-22.CEL SG-22.CEL       RAD    NOMUT      SA  NOMUT.SA NOMUT.SA.RAD
##   Sample Cancer
## 1     17     NO
## 2     18     NO
## 3     19     NO
## 4     20     NO
## 5     21     NO
## 6     22     NO

2.1 Read CEL files

Once the data is loaded, we will read the CEL files with the ReadAffy() function.

# change row names of pdata
rownames(pdata)<-pdata$ID
pdata<-pdata[,-1]

# Read CEL files 
setwd(file.path(wd,cel.wd))
brca_data <- ReadAffy(filenames = as.vector(rownames(pdata)), phenoData = pdata)

brca_data
## 
## AffyBatch object
## size of arrays=1164x1164 features (57 kb)
## cdf=HG-U133_Plus_2 (54675 affyids)
## number of samples=106
## number of genes=54675
## annotation=hgu133plus2
## notes=

Brca_data is an AffyBatch Object that contains all 106 samples and 54675 probe sets.

2.2 Data exploration

We will explore the data to help clarify what we have now.

# --- matrix of intensities for each probe ---
head(exprs(brca_data))[,1:6]
##   SG-17.CEL SG-18.CEL SG-19.CEL SG-20.CEL SG-21.CEL SG-22.CEL
## 1       142       105        69        75        55        60
## 2      5140      4553      3755      4013      4727      3307
## 3       118       120        70        74       104        77
## 4      5696      4599      3908      4310      4680      3562
## 5        78        53        46        53        52        39
## 6        63        65        62        70        59        52
dim(exprs(brca_data))
## [1] 1354896     106

The matrix of intensities contains raw signal, not normalized. We have 106 samples and 1354896 probes. Now let’s get information regarding the phenodata.

The following tables will summarize the number of individuals for each condition:

# --- Get phenoData ---
head(pData(brca_data))
##           Treatment Mutation Disease Phenotype    ShortName Sample Cancer
## SG-17.CEL       NOR    BRCA2      SA  BRCA2.SA BRCA2.SA.NOR     17     NO
## SG-18.CEL       RAD    BRCA2      SA  BRCA2.SA BRCA2.SA.RAD     18     NO
## SG-19.CEL       NOR    BRCA2      SA  BRCA2.SA BRCA2.SA.NOR     19     NO
## SG-20.CEL       RAD    BRCA2      SA  BRCA2.SA BRCA2.SA.RAD     20     NO
## SG-21.CEL       NOR    NOMUT      SA  NOMUT.SA NOMUT.SA.NOR     21     NO
## SG-22.CEL       RAD    NOMUT      SA  NOMUT.SA NOMUT.SA.RAD     22     NO
# Number of healthy and Afected samples
table(pData(brca_data)$Disease)
## 
## AF SA 
## 52 54
# Number of samples for each phenotype
table(pData(brca_data)$Phenotype)
## 
## BRCA1.AF BRCA1.SA BRCA2.AF BRCA2.SA NOMUT.AF NOMUT.SA 
##       20       16       22       18       10       20
# Number of samples non-radiated and radiated
table(pData(brca_data)$Treatment)
## 
## NOR RAD 
##  53  53

3 Quality control

In the quality control step, we will check the distributions of the data to detect possible outliers and remove them from the study.

3.1 Image assessment

We first start checking the images that are generated by the scan. The intensities are measuring the amount of expression in each probe.

# --- array images ----
#image(brca_data)

The images of the scan were generated for all the 106 samples, but just the image for sample 17 is showed. The rest of the images can be found in the Supplementary section.

In SG-18.CEL, SG-107.CEL and SG-121.CEL there are little marks or spots, but they were not big enough to remove the arrays.

3.2 Data quality plots

Now we will check the quality of the data by producing some intensity density plots, intensity box plots, MAplots and RLE and NUSE box plots.

# BOXPLOT
colors<-palette('Set2')
boxplot(brca_data,col=colors,las=3,cex.axis=0.3, main='Boxplot of intensities',ylab='Intensity level',xlab='samples') 

In the boxplot we see that all the samples intensity medians are quite in the same range. Obviously there are samples with higher expression and other with lower, but it is fine to see heterogeneity. No outliers are detected. The boxplots of the three samples SG-18.CEL, SG-107.CEL and SG-121.CEL seem to have a normal distribution such as other samples.

# HISTOGRAM

hist(brca_data,col=colors, main='Histogram of intensities',ylab='Density',xlab='Log itensity')

This histogram shows the distribution of the intensities. Sample SG-81.CEL seems to have a little bit of more intensities than the others, but it is not considered an outlier.

# MA plot
# MAplot(brca_data) #each array against a pseudo-median of all other arrays

In this MA plot, we are comparing the sample 17 with the reference, and we are measuring the differences in the intensity of the probes. As the red line is above the blue, that means that this sample has higher intensity than the median. The other plots can be found in the Supplementary section.

# ----- RLE plots --------
library(affyPLM)
fitmodel<-fitPLM(brca_data) #fit a model
RLE(fitmodel,las=3,cex.axis=0.3,col=colors)

Again, we study the probes while comparing them to the other probes in the rest of the arrays. We compute the expression of one probe and compare to the expression media of the same probes in the other arrays. Once we have fit the model, the RLE plot is generated and all the medians are close to 0, that is what we expect.

# ------- NUSE plots -----------
NUSE(fitmodel,las=3,cex.axis=0.3,col=colors,ylim=c(0.95,1.1))

We also study the Normalized Unscaled Standard Errors (NUSE plots), in which we expect to see the median close to 1 for all the samples. There is a little bit of dispersion, but all the samples are really close to 1.

In conclusion, there is a good quality of the samples and no array is removed from the study.

4 Normalization

After the quality control, we need to perform a normalization step, applying the Robust Multichip Average (RMA) approach.

# Normalization
brca.rma <- affy::rma(brca_data)#Normalization
## Background correcting
## Normalizing
## Calculating Expression
dim(brca.rma)
## Features  Samples 
##    54675      106

After normalization we have 54675 summarized probesets.

# Boxplot of intensities
boxplot(exprs(brca.rma),las=3,cex.axis=0.3,outline=FALSE) 
abline(h=median(exprs(brca.rma)),col="blue")

The boxplot on the normalized intensities is performed and the distribution is similar and homogeneous in all of the samples.

5 Sample aggregation

To see how samples aggregate, hierarchical clustering and PCA is performed. The purpose is to see if samples aggregate by their condition, or they don’t. We will see if samples aggregate by disease (Cancer affected versus healthy), and by phenotype (BRCA1.AF/BRCA1.SA/BRCA2.AF/BRCA2.SA/NOMUT.SA/NOMUT/AF).

library(dendextend)

# --- Get matrix of intensities ---
expression_all<-exprs(brca.rma)

We will use different methods to check if the clusterization is the same or similar.

5.1 Sample aggregation by disease

## --- Euclidean distance,method ward.D2 ---
hcwd2_all <- hclust(dist(t(expression_all)),method="ward.D2")

# Build dendrogram object from hclust results
dend_wd2_all <- as.dendrogram(hcwd2_all)
order_wd2_all<-unlist(dend_wd2_all)

## sample aggregation by disease
condition<-c(pData(brca.rma)$Disease)
condition<-condition[order_wd2_all] 
colors<-palette('Dark2') # set a color palette
names(colors)<-levels(factor(condition))
condition_colors <- colors[condition]

dend_wd2_all %>% set("labels_col", condition_colors) %>% 
  set("labels_cex", 0.7) %>% # Change size
  plot(main = "Dendogram") # plot
legend("topright", legend = unique(condition),fill = colors)

## --- Correlation based distance, average method ---
clust.cor.average_all<- hclust(as.dist(1-cor(expression_all)),method="average")
dendcbd_all<-as.dendrogram(clust.cor.average_all)
order_cbd_all<-unlist(dendcbd_all)
condition<-c(pData(brca.rma)$Disease)
condition<-condition[order_cbd_all]
condition<-relevel(factor(condition),'SA')
condition_colors2 <- colors[as.numeric(factor(condition))]

dendcbd_all %>% set("labels_col", condition_colors2) %>% # change color
  set("labels_cex", 0.7) %>% # Change size
  plot(main = "Dendrogram") # plot
legend("topright", legend = unique(condition),fill = colors)

Samples do not aggregate well by disease condition.

5.2 Sample aggregation by phenotype

# --- Euclidean distance, method ward.D2 ---
hcwd2_all <- hclust(dist(t(expression_all)),method="ward.D2")
dendwd2_all <- as.dendrogram(hcwd2_all)
orderwd2_all<-unlist(dendwd2_all)

condition<-c(pData(brca.rma)$Phenotype)
condition<-condition[orderwd2_all]
condition<-factor(condition,c('BRCA2.SA','BRCA1.AF','BRCA2.AF','NOMUT.AF','NOMUT.SA','BRCA1.SA'))
condition_colors <- colors[as.numeric(factor(condition))]

dendwd2_all %>% set("labels_col", condition_colors) %>% # change color
  set("labels_cex", 0.7) %>% # Change size
  plot(main = "Dendrogram") # plot
legend("topright", legend = unique(condition), fill = colors)

No aggregation of samples based on their phenotype condition.

5.3 PCA

As there is a very large number of variables (probe sets), we will use the principal component analysis (PCA) to reduce the dimensionality of the data.

summary(pca.filt <- prcomp(t(expression_all), scale=T ))
## Importance of components:
##                            PC1      PC2      PC3      PC4     PC5      PC6
## Standard deviation     79.6633 68.48597 55.90633 48.04736 44.3012 40.31132
## Proportion of Variance  0.1161  0.08579  0.05717  0.04222  0.0359  0.02972
## Cumulative Proportion   0.1161  0.20186  0.25902  0.30125  0.3371  0.36686
##                             PC7      PC8      PC9     PC10     PC11     PC12
## Standard deviation     36.90297 35.81462 33.42889 31.93710 29.42407 27.27534
## Proportion of Variance  0.02491  0.02346  0.02044  0.01866  0.01583  0.01361
## Cumulative Proportion   0.39177  0.41523  0.43567  0.45433  0.47016  0.48377
##                            PC13     PC14     PC15     PC16    PC17     PC18
## Standard deviation     26.07455 25.14259 24.56020 24.19572 23.1434 22.55918
## Proportion of Variance  0.01243  0.01156  0.01103  0.01071  0.0098  0.00931
## Cumulative Proportion   0.49620  0.50776  0.51880  0.52950  0.5393  0.54861
##                            PC19     PC20     PC21     PC22     PC23    PC24
## Standard deviation     22.35080 21.77851 21.38359 21.13638 20.76768 20.5149
## Proportion of Variance  0.00914  0.00867  0.00836  0.00817  0.00789  0.0077
## Cumulative Proportion   0.55774  0.56642  0.57478  0.58295  0.59084  0.5985
##                            PC25     PC26     PC27     PC28     PC29     PC30
## Standard deviation     20.42670 20.06158 19.82712 19.57890 19.50333 19.43784
## Proportion of Variance  0.00763  0.00736  0.00719  0.00701  0.00696  0.00691
## Cumulative Proportion   0.60617  0.61353  0.62072  0.62773  0.63469  0.64160
##                            PC31     PC32     PC33     PC34     PC35     PC36
## Standard deviation     19.11858 19.06093 18.82289 18.61306 18.59403 18.49415
## Proportion of Variance  0.00669  0.00665  0.00648  0.00634  0.00632  0.00626
## Cumulative Proportion   0.64829  0.65493  0.66141  0.66775  0.67407  0.68033
##                            PC37     PC38     PC39     PC40     PC41     PC42
## Standard deviation     18.29260 18.21857 18.12058 18.06825 17.90857 17.85582
## Proportion of Variance  0.00612  0.00607  0.00601  0.00597  0.00587  0.00583
## Cumulative Proportion   0.68645  0.69252  0.69852  0.70449  0.71036  0.71619
##                            PC43     PC44    PC45     PC46     PC47     PC48
## Standard deviation     17.79625 17.76590 17.6496 17.58135 17.48383 17.32658
## Proportion of Variance  0.00579  0.00577  0.0057  0.00565  0.00559  0.00549
## Cumulative Proportion   0.72198  0.72776  0.7335  0.73911  0.74470  0.75019
##                            PC49     PC50     PC51     PC52     PC53     PC54
## Standard deviation     17.29437 17.11942 17.06561 17.03125 16.95984 16.90206
## Proportion of Variance  0.00547  0.00536  0.00533  0.00531  0.00526  0.00523
## Cumulative Proportion   0.75566  0.76102  0.76635  0.77165  0.77691  0.78214
##                            PC55     PC56    PC57     PC58     PC59     PC60
## Standard deviation     16.84337 16.78317 16.6908 16.57630 16.54249 16.44427
## Proportion of Variance  0.00519  0.00515  0.0051  0.00503  0.00501  0.00495
## Cumulative Proportion   0.78733  0.79248  0.7976  0.80260  0.80761  0.81255
##                           PC61     PC62     PC63    PC64     PC65     PC66
## Standard deviation     16.3677 16.31774 16.28260 16.1967 16.08422 16.01629
## Proportion of Variance  0.0049  0.00487  0.00485  0.0048  0.00473  0.00469
## Cumulative Proportion   0.8175  0.82232  0.82717  0.8320  0.83670  0.84139
##                            PC67    PC68     PC69     PC70    PC71     PC72
## Standard deviation     15.88124 15.8544 15.77516 15.74969 15.6804 15.63660
## Proportion of Variance  0.00461  0.0046  0.00455  0.00454  0.0045  0.00447
## Cumulative Proportion   0.84600  0.8506  0.85515  0.85969  0.8642  0.86866
##                            PC73     PC74     PC75     PC76    PC77     PC78
## Standard deviation     15.59886 15.49399 15.43644 15.38978 15.3260 15.29705
## Proportion of Variance  0.00445  0.00439  0.00436  0.00433  0.0043  0.00428
## Cumulative Proportion   0.87311  0.87750  0.88186  0.88619  0.8905  0.89477
##                            PC79     PC80     PC81     PC82     PC83     PC84
## Standard deviation     15.26197 15.12645 15.10216 15.08316 15.06974 14.98167
## Proportion of Variance  0.00426  0.00418  0.00417  0.00416  0.00415  0.00411
## Cumulative Proportion   0.89903  0.90321  0.90738  0.91154  0.91570  0.91980
##                            PC85     PC86     PC87     PC88     PC89     PC90
## Standard deviation     14.93788 14.87251 14.85803 14.76513 14.70986 14.70342
## Proportion of Variance  0.00408  0.00405  0.00404  0.00399  0.00396  0.00395
## Cumulative Proportion   0.92388  0.92793  0.93197  0.93595  0.93991  0.94387
##                            PC91     PC92     PC93     PC94     PC95     PC96
## Standard deviation     14.63264 14.62300 14.58208 14.56691 14.51330 14.40038
## Proportion of Variance  0.00392  0.00391  0.00389  0.00388  0.00385  0.00379
## Cumulative Proportion   0.94778  0.95169  0.95558  0.95946  0.96332  0.96711
##                            PC97     PC98     PC99    PC100    PC101    PC102
## Standard deviation     14.36930 14.34839 14.28021 14.26261 14.16653 14.08077
## Proportion of Variance  0.00378  0.00377  0.00373  0.00372  0.00367  0.00363
## Cumulative Proportion   0.97089  0.97465  0.97838  0.98210  0.98577  0.98940
##                           PC103    PC104    PC105     PC106
## Standard deviation     13.97735 13.90222 13.82163 3.165e-13
## Proportion of Variance  0.00357  0.00353  0.00349 0.000e+00
## Cumulative Proportion   0.99297  0.99651  1.00000 1.000e+00

The first principal component explains just 11,6% of the variability of the data, the second PC, around 8,5%, and the third PC 5,7%, which is very few variability.

# --- PCA by phenotype ---
library(factoextra)
fviz_pca_ind(pca.filt,col.ind = pData(brca.rma)$Phenotype,addEllipses = TRUE,geom.ind = 'point')

# ---PCA by disease ---
fviz_pca_ind(pca.filt,col.ind = pData(brca.rma)$Disease,addEllipses = TRUE,geom.ind = 'point')

With the first two PC around 20% of the variability of the data is explained, which is very low.

The ellipses overlap each other, which implies similar gene expression profiles between each condition.

To sum up this part, after performing hierarchical clustering and PCA, there is no good aggregation of the samples, and there is not a specific gene profile for each condition.

6 Annotation

In this step, we will assign the identifiers to known annotations. To do this, we will need the corresponding R annotation package to the microarray technology that was used. This R package corresponds to the Affymetrix HG-U133_Plus_2 Array annotation data (chip hgu133plus2). The version of this package is the 3.13.0.

As there are different probe sets in a microarray that correpond to the same gene, we will use the probe ID plus the gene symbol as the row name of the expression matrix, by now. There might be the presence of Not Available (NA) annotations for certain probe sets, and these probes will be removed from the experiment.

library(annotate)
library(hgu133plus2.db)

# --- Annotation ---
data<-rownames(exprs(brca.rma))
symbol_data <-mget(data, env = hgu133plus2SYMBOL)

annotated_data<-paste(data,symbol_data,sep = '_')
rownames(brca.rma)<-annotated_data


# --- Remove NA in the annotation ---
expression_brca<-exprs(brca.rma)
pos_all_NA<-grep('_NA$',rownames(expression_brca))
brca.rma_noNA<-brca.rma[-pos_all_NA,]

dim(brca.rma)
## Features  Samples 
##    54675      106
dim(brca.rma_noNA) # remove 11574 NA
## Features  Samples 
##    43101      106
length(pos_all_NA)
## [1] 11574

We can see that when using this R package, there are 11574 probes sets that are not annotated (NA). In order to reduce the number of NA, and to try to get the maximum information as possible, we will use the annotation file provided by the microarray manufacturer, which is Thermo Fisher. This annotation file contains information for every probe set, such as the gene symbol and we will check whether the NA are indeed NA in the Thermo fhisher file or if we get some annotated genes for those non annotated probe sets using the R package.

6.1 Check NA probe sets

library(dplyr)
library(readr)
HG_U133_Plus_2_na36_annot <- read_csv("HG-U133_Plus_2.na36.annot.csv", skip = 25)
## Rows: 54675 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (41): Probe Set ID, GeneChip Array, Species Scientific Name, Annotation ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(HG_U133_Plus_2_na36_annot)[1:5,c(1,14:17)]
## # A tibble: 5 × 5
##   `Probe Set ID` `Gene Title`               `Gene Symbol` `Chromosomal Location`
##   <chr>          <chr>                      <chr>         <chr>                 
## 1 1007_s_at      discoidin domain receptor… DDR1          chr6p21.3             
## 2 1053_at        replication factor C subu… RFC2          chr7q11.23            
## 3 117_at         heat shock 70kDa protein … HSPA6         chr1q23               
## 4 121_at         paired box 8               PAX8          chr2q13               
## 5 1255_g_at      guanylate cyclase activat… GUCA1A        chr6p21.1             
## # ℹ 1 more variable: `Unigene Cluster Type` <chr>
# Get NA annotated probe names (of hgu133plus2.db R package)
NA_values<-grep('_NA$',rownames(expression_brca),value=TRUE)# probe set names of the no annotated 
NA_values<-gsub('_NA','',NA_values)

#select the gene names in the thermo-fisher annotation file of the R package NA values: 
pos_NA<-which(HG_U133_Plus_2_na36_annot$`Probe Set ID` %in% NA_values)
HG_na<-HG_U133_Plus_2_na36_annot[pos_NA,]

#Select only the columns Probe set ID and the Gene symbol
HG_gene_symbol_NA<-as.data.frame(HG_na[,c("Probe Set ID", "Gene Symbol")])

head(NA_values)# probe set names of the no annotated with R package 
## [1] "1552258_at"   "1552283_s_at" "1552401_a_at" "1552411_at"   "1552412_a_at"
## [6] "1552449_a_at"
head(HG_gene_symbol_NA) # Probe set ID + gene symbol of only those probe sets that were no annotated previously
##   Probe Set ID           Gene Symbol
## 1   1552258_at                   ---
## 2 1552283_s_at  ZDHHC11 /// ZDHHC11B
## 3 1552401_a_at   BACH1 /// GRIK1-AS2
## 4   1552411_at DEFB106A /// DEFB106B
## 5 1552412_a_at DEFB106A /// DEFB106B
## 6 1552449_a_at   SCGB1C1 /// SCGB1C2

This data frame contains the probe sets and the gene symbol of the thermo fisher file that were not annotated using the R package. As it is seen there are some probes that keep with no annotation (—), and others that have indeed an annotation.

#need to order the HG data frame based on the order of the NA_values order
index <- match(NA_values, HG_gene_symbol_NA$`Probe Set ID`)
HG_gene_symbol_NA_ordered <- HG_gene_symbol_NA[index, ]

identical(NA_values,HG_gene_symbol_NA_ordered$`Probe Set ID`)
## [1] TRUE
# Probe sets that are not annotated neither using R package and the thermo fisher file
dim(HG_gene_symbol_NA_ordered[HG_gene_symbol_NA_ordered$`Gene Symbol`=='---',])
## [1] 9390    2
# Probe sets that are being annotated when using the thermo fisher file and were not annotated with the R package
HG_gene_name_ofNAexpressionset<-HG_gene_symbol_NA_ordered[HG_gene_symbol_NA_ordered$`Gene Symbol`!='---',]
dim(HG_gene_name_ofNAexpressionset)
## [1] 2184    2
head(HG_gene_name_ofNAexpressionset)
##   Probe Set ID           Gene Symbol
## 2 1552283_s_at  ZDHHC11 /// ZDHHC11B
## 3 1552401_a_at   BACH1 /// GRIK1-AS2
## 4   1552411_at DEFB106A /// DEFB106B
## 5 1552412_a_at DEFB106A /// DEFB106B
## 6 1552449_a_at   SCGB1C1 /// SCGB1C2
## 8   1552607_at   FAM223A /// FAM223B
#2184 probe sets with annotation that have to be substituted in the expressionSet object

Of the 11574 no annotated probes using the R package ‘hgu133plus2.db’, there are 9390 probe sets that will remain no annotated, whereas 2184 probe sets have an annotation.

In microarrays, a probe set can have more than one annotated gene associated with it, as it seen in the above table. Since each probe set contains multiple individual probes with varying sequences, they can target a region that is shared by some genes or transcripts isoforms. This occurs when there are regions of sequence similarity or overlap among different genes, so, the probe set can hybridize with multiple transcripts or genes.

To deal with this, we will get the first gene symbol identification for those probes that have multiple gene symbols annotated.

# ----  Get the first Gene Symbol ----
# Create an empty vector to store the first values
split_values <- character(length(HG_gene_name_ofNAexpressionset$`Gene Symbol`))
for (i in 1:nrow(HG_gene_name_ofNAexpressionset)){
  split_values[i] <- strsplit(HG_gene_name_ofNAexpressionset$`Gene Symbol`, " /// ")[[i]][1]
  }

HG_gene_name_ofNAexpressionset$Unique_Gene_symbol<-unlist(split_values)

Once we have a unique gene symbol for each probe set, we have to substitute them for the NA symbol in the expression matrix.

# Get the gene names and substitute them instead of NA
change_NA_values<-intersect(NA_values,HG_gene_name_ofNAexpressionset$`Probe Set ID`)
new_probesets_anotated<-paste(change_NA_values,HG_gene_name_ofNAexpressionset$Unique_Gene_symbol,sep = '_')

pos_inthematrix_that_need_tobechange<-which(data %in% change_NA_values)

#Subset the expression matrix of those probe sets its name has to be changed
matrix_subset <- as.matrix(expression_brca[pos_inthematrix_that_need_tobechange, ])#this contains the intensity values of the probes sets that its gene symbol has been change
rownames(matrix_subset) <- new_probesets_anotated

expression_brca[pos_inthematrix_that_need_tobechange,][1:5,1:5]
##                 SG-17.CEL SG-18.CEL SG-19.CEL SG-20.CEL SG-21.CEL
## 1552283_s_at_NA  3.333612  2.915651  2.836237  3.003430  2.911392
## 1552401_a_at_NA  3.121932  2.944171  3.002741  2.970200  3.212224
## 1552411_at_NA    4.400499  4.726635  5.121501  4.918845  4.851512
## 1552412_a_at_NA  2.742122  2.444967  2.608570  2.716073  2.581393
## 1552449_a_at_NA  2.677066  2.535201  2.727358  2.794929  2.653370
head(matrix_subset)[1:5,1:5]
##                       SG-17.CEL SG-18.CEL SG-19.CEL SG-20.CEL SG-21.CEL
## 1552283_s_at_ZDHHC11   3.333612  2.915651  2.836237  3.003430  2.911392
## 1552401_a_at_BACH1     3.121932  2.944171  3.002741  2.970200  3.212224
## 1552411_at_DEFB106A    4.400499  4.726635  5.121501  4.918845  4.851512
## 1552412_a_at_DEFB106A  2.742122  2.444967  2.608570  2.716073  2.581393
## 1552449_a_at_SCGB1C1   2.677066  2.535201  2.727358  2.794929  2.653370

We can see that the probe set names that were annotated in the thermo fisher file and didn’t in the R package, have been changed.

Now, we want to know how many probe sets are annotated in both R package and Thermo fisher file.

# Get the probe sets that were annotated using the R package.
probsets_well_annotated_R_pack<-rownames(exprs(brca.rma_noNA))

# Split the vector into probe names and gene names using the last underscore
split_probesets <- strsplit(probsets_well_annotated_R_pack, "_(?!.*_)", perl = TRUE)

# Extract the probe set names and the gene names and create a data frame
probe_names<-NULL
gene_names<-NULL
for (i in 1:length(split_probesets)){
  probe_names[i]<-split_probesets[[i]][1]
  gene_names[i]<-split_probesets[[i]][2]
 }
df_annotated_probes_R_pack <- data.frame(ProbeName = probe_names, GeneName = gene_names)
head(df_annotated_probes_R_pack)
##   ProbeName GeneName
## 1 1007_s_at     DDR1
## 2   1053_at     RFC2
## 3    117_at    HSPA6
## 4    121_at     PAX8
## 5 1255_g_at   GUCA1A
## 6   1294_at     UBA7
# To know the position of the probes that are annotated in both data frames.
x<-which(HG_U133_Plus_2_na36_annot$`Probe Set ID`%in%df_annotated_probes_R_pack$ProbeName)
df_TF<-HG_U133_Plus_2_na36_annot[x,c("Probe Set ID", "Gene Symbol")]

#Check the probe set names are in the same order than the df of thermo.fisher
identical(df_annotated_probes_R_pack$ProbeName,df_TF$`Probe Set ID`)
## [1] FALSE
index <- match(df_annotated_probes_R_pack$ProbeName, df_TF$`Probe Set ID`)
df_TF_or <- df_TF[index, ]

identical(df_annotated_probes_R_pack$ProbeName,df_TF_or$`Probe Set ID`)
## [1] TRUE
# How many of the annotated probes sets using the R package, are no annotated in the TF file
dim(df_TF_or[df_TF_or$`Gene Symbol`=='---',])
## [1] 229   2
# There are 229 probe sets that were annotated using the R package and are not annotated in the TF file
dim(df_TF_or[df_TF_or$`Gene Symbol`!='---',])
## [1] 42872     2
# There are 42872 probe sets that were annotated using the R package, and keep being annotated in the TF file

There are 229 probe sets that were annotated when using the R package, and are no annotated in the thermo fisher file, and 42872 probe sets that are annotated in the R package and keep being annotated in the thermo fisher file. This results are summarized in the following confusion matrix.

# --- Confusion matrix ---
confusion_matrix<-data.frame(Annotated_TF_file=c(dim(df_TF_or[df_TF_or$`Gene Symbol`!='---',])[1],dim(HG_gene_name_ofNAexpressionset)[1]               ),No_annotated_TF_file=c(dim(df_TF_or[df_TF_or$`Gene Symbol`=='---',])[1],dim(HG_gene_symbol_NA_ordered[HG_gene_symbol_NA_ordered$`Gene Symbol`=='---',])[1]),row.names = c('Annotated_R_package','No_annotated_R_package'))

confusion_matrix
##                        Annotated_TF_file No_annotated_TF_file
## Annotated_R_package                42872                  229
## No_annotated_R_package              2184                 9390

It is important to check whether the probes that are annotated in both R package and in the thermo fisher file, have the same gene symbol. To do so, we will calculate the percentage of the gene symbols that are identical.

# Select the 42872 probe sets.
df_annotated2_TF<-df_TF_or[df_TF_or$`Gene Symbol`!='---',]
poss<-which(df_annotated_probes_R_pack$ProbeName %in% df_annotated2_TF$`Probe Set ID`)
df_annotated_probes_R2<-df_annotated_probes_R_pack[poss,]

# The probe sets need to be in the same order, otherwise, we will get a false percentage
identical(df_annotated2_TF$`Probe Set ID`,df_annotated_probes_R2$ProbeName)
## [1] TRUE
rownames(df_annotated_probes_R2)<-1:nrow(df_annotated_probes_R2)

# Select the probe sets that have more than one gene symbol annotation, and continue with the first annotation.
pos_multiplegenes<-grep('///',df_annotated2_TF$`Gene Symbol`)
df_TF_multiplegenes<-df_annotated2_TF[pos_multiplegenes,]

split_values2 <- NULL
for (i in 1:nrow(df_TF_multiplegenes)){
  split_values2[i] <- strsplit(df_TF_multiplegenes$`Gene Symbol`, " /// ")[[i]][1]
  }
df_TF_multiplegenes$Unique_Gene_symbol<-unlist(split_values2)

# Create a data frame with the probe name and with only one Gene symbol annotation.
merged_df <- merge(df_annotated2_TF, df_TF_multiplegenes, by = "Probe Set ID", all.x = TRUE)
merged_df<-merged_df[,-3]
# Update the Gene column with the values from the second data frame
merged_df$Gene <- ifelse(is.na(merged_df$Unique_Gene_symbol), merged_df$`Gene Symbol.x`, merged_df$Unique_Gene_symbol)
merged_df <- merged_df[, c("Probe Set ID", "Gene")]

# Find the number of matching names in the same position
identical(df_annotated_probes_R2$ProbeName,merged_df$`Probe Set ID`) #make sure every ID is the same in both df
## [1] TRUE
sum(df_annotated_probes_R2$GeneName==merged_df$Gene)/length(merged_df$Gene)*100
## [1] 92.7925

There are 42872 probe sets that are annotated in both ways, using the R package and the annotation file. From this probe sets, 92,79% have exactly the same Gene Symbol. 3090 probe sets have a different annotated gene name, but that doesn’t mean that the gene is different. As we can see next, there are some probe sets that refer to the same gene, but the Gene Symbol is the previous used or a synonymous. It can may happen that, for a probe set, more than one possible gene is identified. In this analysis we keep with the first annotated gene, but maybe the R package is selecting the second of the TF file.

#How many probe ID are differently annotated
sum(df_annotated_probes_R2$GeneName!=merged_df$Gene)
## [1] 3090
# Example of probe sets with different Gene
grep('1558480_at',merged_df$`Probe Set ID`)
## [1] 3536
df_annotated_probes_R2[3536,]
##       ProbeName GeneName
## 3536 1558480_at TMCC1-DT
merged_df[3536,]
##      Probe Set ID      Gene
## 3536   1558480_at TMCC1-AS1
df_annotated_probes_R2[6707,]
##       ProbeName  GeneName
## 6707 1569973_at SEPTIN7P2
merged_df[6707,]
##      Probe Set ID    Gene
## 6707   1569973_at SEPT7P2

For example, the probe set 1558480_at in the R hgu133plus2.db package is annotated as TMCC1-DT. However, in the Thermo fisher annotation file, is as TMCC1-AS1. As it indicates the HUGO Gene Nomenclature Committee (HGNC), the approved symbol is TMCC1-DT, but the previous one was TMCC1-AS1. The same occurs with the probe set 1569973_at. The approved symbol is SEPTIN7P2 and the previous one was SEPT7P2. So, both annotation methods make reference to the same gene, although the name is different and this affects the percentage of same gene symbols between the annotated probe sets in R and in the Thermo fisher. In conclusion, the percentage of gene symbols shared by the R hgu133plus2.db package and the Thermo fisher file, is indeed higher than 92,72%.

Let’s check whether the genomic position of the 229 non-annotated probes using the thermo fisher file that were annotated using R, corresponds to the gene that is annotated. We will use the UCSC Genome browser on Human, version GRCh37/hg19.

# Select the probes annotated in R but not in the Thermo fisher file.
TF_no_annotated_yesR<-df_TF_or[df_TF_or$`Gene Symbol`=='---',]
probes_id<-TF_no_annotated_yesR$`Probe Set ID`
rownames(HG_U133_Plus_2_na36_annot)<-HG_U133_Plus_2_na36_annot$`Probe Set ID`
head(HG_U133_Plus_2_na36_annot[probes_id,c(1,13:16)])
## # A tibble: 6 × 5
##   `Probe Set ID` Alignments    `Gene Title` `Gene Symbol` `Chromosomal Location`
##   <chr>          <chr>         <chr>        <chr>         <chr>                 
## 1 1552976_at     chr11:736747… ---          ---           ---                   
## 2 1553208_s_at   chr5:1757925… ---          ---           ---                   
## 3 1553547_at     chr13:799505… ---          ---           ---                   
## 4 1554194_at     chr8:2246064… ---          ---           ---                   
## 5 1554448_at     ---           ---          ---           ---                   
## 6 1556486_at     chr10:476560… ---          ---           ---
rownames(df_annotated_probes_R_pack)<-df_annotated_probes_R_pack$ProbeName
head(df_annotated_probes_R_pack[probes_id,])
##                 ProbeName     GeneName
## 1552976_at     1552976_at      DNAJB13
## 1553208_s_at 1553208_s_at        ARL10
## 1553547_at     1553547_at        RBM26
## 1554194_at     1554194_at LOC107986876
## 1554448_at     1554448_at          JPX
## 1556486_at     1556486_at LOC105378577

So, the genomic location of the probe sets is in the annotation file. We will select the the coordinates for each probe, search them in the USCS genome browser, and compare if the gene is the same that annotated using the R package.

For the first probe, 1552976_at, the genomic coordinates correspond to the gene DNAJB13, which is the same that was annotated.

For the probe 1554194_at, we can see that the genomic coordinates corresponds to C8orf58 gene, and the gene annotated in the R package is a LOC. In this case, the gene do not match.

After analyzing and comparing the annotations by the two different approaches, we decide to keep with the downstream analysisi with those probes sets annotated by the Thermo Fisher file. Since the annotation file is provided by the microarray manufacturer, it is considered an official, curated and reliable source of information, ensuring accuracy. On the other hand, the R package for probe annotations is typically maintained by researchers or bioinformatics, which are more versatile.

In conclusion, we will keep the analysis with the probe sets that were annotated using the Thermo fisher annotation file. In total, we retrieve 45056 probe sets.

annotated_probes_TF<-HG_U133_Plus_2_na36_annot[HG_U133_Plus_2_na36_annot$`Gene Symbol`!='---',]
annotated_probes_TF2<-annotated_probes_TF[,c("Probe Set ID", "Gene Symbol")]

# Select the probe sets that have more than one gene symbol annotation, and continue with the first annotation.
pos_multiplegenes<-grep('///',annotated_probes_TF2$`Gene Symbol`)
df_TF_multiplegenes<-annotated_probes_TF2[pos_multiplegenes,]

split_values2 <- NULL
for (i in 1:nrow(df_TF_multiplegenes)){
  split_values2[i] <- strsplit(df_TF_multiplegenes$`Gene Symbol`, " /// ")[[i]][1]
  }
df_TF_multiplegenes$Unique_Gene_symbol<-unlist(split_values2)

# Create a data frame with the probe name and with only one Gene symbol annotation.
merged_df <- merge(annotated_probes_TF2, df_TF_multiplegenes, by = "Probe Set ID", all.x = TRUE)
merged_df<-merged_df[,-3]
# Update the Gene column with the values from the second data frame
merged_df$Gene <- ifelse(is.na(merged_df$Unique_Gene_symbol), merged_df$`Gene Symbol.x`, merged_df$Unique_Gene_symbol)
merged_df <- merged_df[, c("Probe Set ID", "Gene")]


pos_probes_expressionmatrix<-which(data %in% merged_df$`Probe Set ID`)
final_expression_matrix <- as.matrix(expression_brca[pos_probes_expressionmatrix, ])

# We need to change the rownames for the probe names of the Thermo fisher file
new_probesets_<-paste(merged_df$`Probe Set ID`,merged_df$Gene,sep = '_')
rownames(final_expression_matrix)<-new_probesets_

dim(final_expression_matrix)
## [1] 45056   106

6.2 Remove duplicated probe set genes

It is frequent that different probe sets correspond to the same gene. Once the probe sets are annotated, we have to remove duplicated genes. Otherwise, the analysis can lead to biased and inaccurate results.

The implemented function, will select the expression values of the duplicated genes, and will compute the mean value for each individual. It will return a list containing each repeated gene, and the mean expression values for each sample. Finally, we will create a matrix, that contains the expression values of both unique and previous duplicated genes.

# --- REMOVE DUPLICATED ----

symbols<-merged_df$Gene

# ----- FUNCTION ----

find_repeated_genes <- function(vector, matrix) {
  # Find repeated gene names
  repeated_names <- names(table(vector))[table(vector) > 1]
  
  # Initialize a list to store the mean values
  repeated_means <- list()
  
  # Iterate over each repeated gene
  for (gene_name in repeated_names) {
    # Find the positions of the repeated gene in the vector
    positions <- which(vector == gene_name)
    
    # Find the rows in the matrix that correspond to the positions
    rows <- matrix[positions, , drop = FALSE]
    
    # Calculate the mean by column for the rows
    means <- colMeans(rows)
    
    # Store the means in the list
    repeated_means[[gene_name]] <- means
  }
  
  # Return the list of mean values for each repeated gene
  return(repeated_means)
}

# Call the function to find the rows of the repeated genes in the matrix
result <- find_repeated_genes(symbols, final_expression_matrix)

# Extract the gene names
gene_names <- names(result)

# Create a matrix with row names for genes and columns for samples
mat <- matrix(unlist(result), nrow = length(gene_names), byrow = TRUE)

# Add row names to the matrix
rownames(mat) <- gene_names
colnames(mat) <- colnames(exprs(brca.rma))

dim(mat)
## [1] 11459   106
# --- Get unique genes ---

# Create a table of gene counts
gene_counts <- table(symbols)

# Extract the genes that occur only once
unique_genes <- names(gene_counts[gene_counts == 1])

#Extract the position of the unique genes.
unique_gene_pos<-match(unique_genes,symbols)

# Search in the matrix, the position of the unique genes 
matrix_unique_genes<-final_expression_matrix[unique_gene_pos,]
rownames(matrix_unique_genes)<-gsub("^.*_", "",rownames(matrix_unique_genes))

# Join the two matrices
final_matrix<-rbind(matrix_unique_genes,mat)
brca.noduplicated<-final_matrix[order(rownames(final_matrix)),]
dim(brca.noduplicated)
## [1] 21923   106
# We need to transform the matrix object brca.noduplicated to an ExpressionSet
library(Biobase)
brca.noduplicated<-ExpressionSet(assayData = brca.noduplicated)
pData(brca.noduplicated)<-pdata

After removing the duplicated genes, we end up with a matrix that contains the expression levels of 21923 genes.

7 Deconvolution based method

Before applying deconvolution, we have to make sure that the input data is:

  1. Normalized

  2. not log-transformed.

The data was normalized in the previous steps, and the transformation of not log data will be performed next, when running the deconvolution. ## Cibersort

In order to use Cibersort, we require to have in local the cibersort source code, that contains two files. These files are downloaded from the cibersort website.

#Set the path to the directory were the deconvolution cibersort files are. 
deconvDir <- file.path("C:/Users/MARINA/Documents/MASTER/TFM/Deconvolution_files")
cibersort_binary = file.path(deconvDir, "CIBERSORT.R")
cibersort_mat = file.path(deconvDir, "CIBERSORT_LM22.txt")

CIBERSORT_LM22 <- read.delim("~/MASTER/TFM/Deconvolution_files/CIBERSORT_LM22.txt", row.names=1)


# Separate radiated from non-radiated
NOR<-pdata[pdata$Treatment=='NOR',]
RAD<-pdata[pdata$Treatment=='RAD',]
# 
radiated_samples<-rownames(RAD)
non_radiated_samples<-rownames(NOR)

Cibersort, allows to perform between cell-type comparisons. That means, for example, for sample SG-17.cel, there are more T cell CD4+ memory activated cells than T cell CD8+.

If we want to perform between sample comparisons, we need to use other methods, such as Timer, epic, or cibersort abs. mode (provides a score that can be compared between both samples and cell types).

7.1 Cibersort abs. mode

We will use cibersort abs. mode, which is a method that provides an score that allows for the comparison of both between cell types, and samples comparisons.

library(immunedeconv)
res_cib_abs=deconvolute_cibersort(2^exprs(brca.noduplicated),arrays = TRUE, absolute = TRUE)
#load(file='res_cis_abs.RData')

res_cib_abs <- data.frame(cell_type=rownames(res_cib_abs),res_cib_abs)
res_cib_abs <- res_cib_abs[,-1]
res_cib_abs <- res_cib_abs %>% mutate(across(where(is.numeric), round, 3))
colnames(res_cib_abs)<-gsub('_2','',colnames(res_cib_abs))
datatable(res_cib_abs) 
# ---  Load the data ---
library(readxl)
Patient_info <- as.data.frame(read_excel("../Patient info.xlsx"))

# --- Prepare it ---
# Change the row names for their array code identification
rownames(Patient_info)<-Patient_info$`ARRAY CODE`
rownames(Patient_info)<-gsub('LL','L',rownames(Patient_info))
rownames(Patient_info)<-gsub('SG-127.CEL','X127.CEL',rownames(Patient_info))
rownames(Patient_info)<-gsub('SG-13.CEL','SG-103.CEL',rownames(Patient_info))
rownames(Patient_info)<-gsub('SG-14.CEL','SG-104.CEL',rownames(Patient_info))
rownames(Patient_info)<-gsub('-','.',rownames(Patient_info))

ind<-match(colnames(res_cib_abs),row.names(Patient_info))
Patient_info<-Patient_info[ind,]
identical(colnames(res_cib_abs),row.names(Patient_info))
## [1] TRUE

Once we have obtained the table of the cell fractions for all the cell types and samples, we can start by assessing its variability in the different conditions we want to study.

In order to know which statistical test is the most appropriate one, we will start by performing a shapiro.test, to assess the normality of the data.

For each cell type, 2 hypothesis are made:

  • H0: Data follow a normal distribution.

  • H1: Data do not follow a normal distribution.

If p-value is greater than 0.05, data follows a normal distribution, whereas if the p-value is less than 0.05 data do not follow a normal distribution, and this will condition the type of test to be used.

# remove cell types that do not have at least 5 unique values
distinct_values_count <- apply(res_cib_abs, 1, function(x) length(unique(x)))

# Subset the data frame to include rows with at least 5 distinct values
df_filtered <- res_cib_abs[distinct_values_count >= 5, ]


test<-apply(df_filtered,1,function(x) shapiro.test(x)$p.value)
test
##                B cells naive               B cells memory 
##                 1.618935e-08                 7.276780e-22 
##                 Plasma cells                  T cells CD8 
##                 1.451799e-15                 1.417897e-03 
##            T cells CD4 naive T cells CD4 memory activated 
##                 4.519715e-02                 5.444164e-03 
##    T cells follicular helper   T cells regulatory (Tregs) 
##                 1.322877e-02                 1.751164e-05 
##          T cells gamma delta             NK cells resting 
##                 3.438464e-09                 9.647252e-07 
##           NK cells activated                    Monocytes 
##                 1.270532e-06                 2.364065e-16 
##               Macrophages M0               Macrophages M1 
##                 8.950692e-12                 4.846139e-02 
##               Macrophages M2    Dendritic cells activated 
##                 2.537247e-21                 2.898118e-01 
##         Mast cells activated                  Eosinophils 
##                 2.146870e-03                 4.009167e-11 
##                  Neutrophils   Absolute score (sig.score) 
##                 1.368962e-12                 8.255260e-02
notdistributed<-df_filtered

# Normal distribution 
hist(as.numeric(df_filtered[16,]))

# Not normal distribution
hist(as.numeric(df_filtered[8,]))

Only the data from dendritic cell type and the total score follows a normal distribution. As most of the cell types do not follow a normal distribution, we will treat all the data as not normal distributed, and we will apply a Wilcoxon for testing paired data for testing the equality of 2 means.

7.1.1 Not normally distributed data cell types

We will start by a general overview, so we will assess the mean differences between 2 general groups: radiated vs non-radiated samples.

In this code, the mean for each cell type is calculated by each group.

# Separate radiated from non-radiated
table(pdata$Treatment)
## 
## NOR RAD 
##  53  53
# vector of conditions
condition_rad_norad=pdata$Treatment

mean_values<-NULL
for (i in 1: nrow(notdistributed)){
    mean<-tapply((unlist(notdistributed[i,-107])),condition_rad_norad, mean)
    rownam<-rownames(notdistributed[i,])
    mean_values<-c(mean_values,c(rownam,mean))
    }


# Function to format the mean_values into groups of 3
format_mean_values <- function(values) {
  result <- list()
  num_values <- length(values)
  i <- 1
  while (i <= num_values) {
    result <- c(result, list(values[i:(i + 2)]))
    i <- i + 3
  }
  return(result)
}

mean_values <- format_mean_values(mean_values)

# Convert mean_values to a data frame
mean_values_df <- do.call(rbind, mean_values)
mean_values_df <- as.data.frame(mean_values_df)

# Rename the columns
colnames(mean_values_df) <- c("Condition", "NOR", "RAD")

mean_values_df$NOR<-round(as.numeric(mean_values_df$NOR),4)
mean_values_df$RAD<-round(as.numeric(mean_values_df$RAD),4)

mean_values_df
##                       Condition    NOR    RAD
## 1                 B cells naive 0.0349 0.0362
## 2                B cells memory 0.0005 0.0000
## 3                  Plasma cells 0.0012 0.0010
## 4                   T cells CD8 0.1307 0.1517
## 5             T cells CD4 naive 0.1066 0.1086
## 6  T cells CD4 memory activated 0.6404 0.5881
## 7     T cells follicular helper 0.0795 0.0914
## 8    T cells regulatory (Tregs) 0.0502 0.0721
## 9           T cells gamma delta 0.0266 0.0278
## 10             NK cells resting 0.0745 0.0964
## 11           NK cells activated 0.0747 0.0660
## 12                    Monocytes 0.0021 0.0013
## 13               Macrophages M0 0.0179 0.0388
## 14               Macrophages M1 0.0820 0.0753
## 15               Macrophages M2 0.0001 0.0006
## 16    Dendritic cells activated 0.0457 0.0575
## 17         Mast cells activated 0.3309 0.3438
## 18                  Eosinophils 0.0093 0.0102
## 19                  Neutrophils 0.0038 0.0045
## 20   Absolute score (sig.score) 1.7125 1.7730

As you can see there is no big differences between the mean cell fraction of no radiated and radiated samples. However, we will perform a wilcoxon test to assess if there are statistical differences.

Apply statistical tests:

# --- radiated vs no-radiated --------------
wilcox.test(unlist(notdistributed['T cells CD8',1:106])~condition_rad_norad)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  unlist(notdistributed["T cells CD8", 1:106]) by condition_rad_norad
## W = 1245, p-value = 0.315
## alternative hypothesis: true location shift is not equal to 0
pvals<-NULL
for (i in 1:nrow(notdistributed)){
  pval<-wilcox.test(unlist(notdistributed[i,])~condition_rad_norad)$p.val
  pvals<-c(pvals,pval)
}

notdistributed$pvals<-pvals

df<-notdistributed[,106:107] #106:107
df[-1]
##                                    pvals
## B cells naive                0.678843864
## B cells memory               0.301368216
## Plasma cells                 0.854854930
## T cells CD8                  0.315014508
## T cells CD4 naive            0.997478868
## T cells CD4 memory activated 0.163532322
## T cells follicular helper    0.197344908
## T cells regulatory (Tregs)   0.016781703
## T cells gamma delta          0.922832389
## NK cells resting             0.195350437
## NK cells activated           0.521461950
## Monocytes                    0.366632627
## Macrophages M0               0.010904862
## Macrophages M1               0.257924181
## Macrophages M2               0.134405612
## Dendritic cells activated    0.003795148
## Mast cells activated         0.525396182
## Eosinophils                  0.216770687
## Neutrophils                  0.805054934
## Absolute score (sig.score)   0.022541861

There are 3 p-values that are less than 0.05, which means that there are statistically evidences to reject the null hypothesis (H0) and thus, the alternative hypothesis (H1) is considered to be proved. So, our results provide support for the hypothesis that there are significant differences between the mean expression of the radiated and non radiated levels of T cells regulatory (Tregs), Macrophages MO, and the total absolute score.

In the other hand, the rest of the cell types present a p-value higher than 0.05, which indicates that there are no statistical significant differences between the mean expression levels of radiated and no-radiated samples.

7.1.1.1 Total CD4 cell type

The main interest of this study is the T CD4 cell type. As cibesort provides detailed information regarding them, and separates the lymphocytes depending on their subgroups (naive or memory activated),we will aggregate them by summing up their expression value for each sample.

# Select T cells CD4 naive
T_CD4_naive_ciber<-notdistributed['T cells CD4 naive',-107]#107
T_CD4_naive_ciber<-as.data.frame(t(T_CD4_naive_ciber))


# Select T cells CD4 memory activated
T_CD4_memory_act_ciber<-notdistributed['T cells CD4 memory activated',-107]
T_CD4_memory_act_ciber<-as.data.frame(t(T_CD4_memory_act_ciber))

# Select total T CD4 cells 
TCD4_cibersort<-notdistributed[c('T cells CD4 naive','T cells CD4 memory activated'),-107]

Total_T_CD4_ciber<-colSums(TCD4_cibersort)
Total_T_CD4_ciber<-as.data.frame(Total_T_CD4_ciber)
Total_T_CD4_ciber
##            Total_T_CD4_ciber
## SG.17.CEL              0.862
## SG.18.CEL              0.847
## SG.19.CEL              0.885
## SG.20.CEL              0.731
## SG.21.CEL              0.751
## SG.22.CEL              0.817
## SG.23.CEL              0.741
## SG.24.CEL              0.594
## SG.25.CEL              0.713
## SG.26.CEL              0.651
## SG.27.CEL              0.898
## SG.28.CEL              0.984
## SG.31.CEL              0.758
## SG.32.CEL              0.865
## SG.33.CEL              0.870
## SG.34.CEL              0.851
## SG.35.CEL              0.829
## SG.36.CEL              0.925
## SG.37.CEL              0.644
## SG.38.CEL              0.545
## SG.39.CEL              1.048
## SG.40.CEL              1.141
## SG.41.CEL              0.938
## SG.42.CEL              0.916
## SG.43.CEL              0.641
## SG.44.CEL              0.709
## SG.45.CEL              1.052
## SG.46.CEL              0.871
## SG.47.CEL              0.914
## SG.48.CEL              0.802
## SG.49.CEL              0.836
## SG.50.CEL              0.838
## SG.51.CEL              0.643
## SG.52.CEL              0.592
## SG.53.CEL              1.109
## SG.54.CEL              1.051
## SG.55.CEL              0.842
## SG.56.CEL              0.845
## SG.57.CEL              0.592
## SG.58.CEL              0.557
## SG.59.CEL              0.920
## SG.60.CEL              0.910
## SG.63.CEL              0.565
## SG.64.CEL              0.446
## SG.65.CEL              0.863
## SG.66.CEL              0.728
## SG.67.CEL              0.355
## SG.68.CEL              0.423
## SG.69.CEL              0.678
## SG.70.CEL              1.055
## SG.71.CEL              0.662
## SG.72.CEL              0.747
## SG.73.CEL              0.678
## SG.74.CEL              0.637
## SG.75.CEL              0.880
## SG.76.CEL              0.697
## SG.77.CEL              0.788
## SG.78.CEL              0.493
## SG.79.CEL              0.760
## SG.80.CEL              0.530
## SG.81.CEL              0.651
## SG.82.CEL              0.577
## SG.83.CEL              1.120
## SG.84.CEL              0.875
## SG.85.CEL              0.449
## SG.86.CEL              0.297
## SG.87.CEL              0.826
## SG.88.CEL              0.847
## SG.89.CEL              0.773
## SG.90.CEL              0.635
## SG.91.CEL              0.386
## SG.92.CEL              0.467
## SG.93.CEL              0.335
## SG.94.CEL              0.253
## SG.95.CEL              0.982
## SG.96.CEL              0.675
## SG.97.CEL              0.798
## SG.98.CEL              0.766
## SG.99.CEL              0.732
## SG.100.CEL             0.517
## SG.101.CEL             0.301
## SG.102.CEL             0.200
## SG.103.CEL             0.496
## SG.104.CEL             0.388
## SG.105.CEL             0.406
## SG.106.CEL             0.347
## SG.107.CEL             0.392
## SG.108.CEL             0.649
## SG.109.CEL             0.837
## SG.110.CEL             0.745
## SG.111.CEL             0.821
## SG.112.CEL             0.702
## SG.113.CEL             0.861
## SG.114.CEL             0.810
## SG.115.CEL             0.833
## SG.116.CEL             0.794
## SG.117.CEL             0.638
## SG.118.CEL             0.462
## SG.119.CEL             0.959
## SG.120.CEL             0.841
## SG.121.CEL             0.698
## SG.122.CEL             0.810
## SG.123.CEL             0.731
## SG.124.CEL             0.633
## X127.CEL               0.854
## SG.128.CEL             0.839

7.1.2 Sankey Plot

In order to visualize the proportion of cell types across samples, we will perform Sankey plots. To properly visualize the plots and to make them informative, we will separate the samples depending on their phenotype (BRCA1 healthy, BRCA1 affected, BRCA2 healthy, BRCA2 affected, NOMUT healthy and NOMUT affected) for no-radiated samples. We will also select those cell types that its mean values are higher than 0.05.

library(ggalluvial)
## Warning: package 'ggalluvial' was built under R version 4.3.1
noradiated<-rownames(pdata[pdata$Treatment=='NOR',])
noradiated2<-as.vector(gsub('-','.',noradiated))
noradiated2<-as.vector(gsub('127.CEL','X127.CEL',noradiated2))

res_norad<-notdistributed[,noradiated2]
sub_resnorad<-res_norad[-20,]
sub_resnorad<-sub_resnorad[which(apply(sub_resnorad,1,mean)>0.05),]
rownames(sub_resnorad)[3]<-'TCD4 memory activated'
rownames(sub_resnorad)[4]<-'T follicular helper'
rownames(sub_resnorad)[5]<-'T regulatory'


#----------------BRCA1 affected-------------------------------------
BRCA1.AF<-rownames(pdata[pdata$Phenotype=='BRCA1.AF',])
BRCA1.AF<-as.vector(gsub('-','.',BRCA1.AF))
pos<-which(colnames(sub_resnorad) %in% BRCA1.AF)

res_noradBRCA1.AF<-sub_resnorad[,pos]

# Put all data in a single vector
vectors<-NULL
for (i in 1:ncol(res_noradBRCA1.AF)){
  vector<-res_noradBRCA1.AF[,i]
  vectors<-c(vectors,vector)
}

data_plot <- data.frame(
  sample = rep(colnames(res_noradBRCA1.AF), each = 9),
  cell_type = rep(rownames(res_noradBRCA1.AF), times = 10),
  value=vectors)


ggplot(data = data_plot,
       aes(axis1 = sample, axis2 = cell_type, y = value)) +
  geom_alluvium(aes(fill = cell_type)) +
  geom_stratum() +
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("sample", "Cell type"),
                   expand = c(0.15, 0.05)) +
  theme_void() + ggtitle('   Cell type distributions in BRCA1 cancer-affected samples')

#----------------BRCA1 healthy-------------------------------------
BRCA1.SA<-rownames(pdata[pdata$Phenotype=='BRCA1.SA',])
BRCA1.SA<-as.vector(gsub('-','.',BRCA1.SA))
BRCA1.SA<-as.vector(gsub('127.CEL','X127.CEL',BRCA1.SA))
pos<-which(colnames(sub_resnorad) %in% BRCA1.SA)

res_noradBRCA1.SA<-sub_resnorad[,pos]

# Put all data in a single vector
vectors<-NULL
for (i in 1:ncol(res_noradBRCA1.SA)){
  vector<-res_noradBRCA1.SA[,i]
  vectors<-c(vectors,vector)
}

data_plot.SA <- data.frame(
  sample = rep(colnames(res_noradBRCA1.SA), each = 9),
  cell_type = rep(rownames(res_noradBRCA1.SA), times = 8),
  value=vectors)


ggplot(data = data_plot.SA,
       aes(axis1 = sample, axis2 = cell_type, y = value)) +
  geom_alluvium(aes(fill = cell_type)) +
  geom_stratum() +
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("sample", "Cell type"),
                   expand = c(0.15, 0.05)) +
  theme_void() +ggtitle('   Cell type distributions in BRCA1 healthy samples')

#----------------BRCA2 affected-------------------------------------
BRCA2.AF<-rownames(pdata[pdata$Phenotype=='BRCA2.AF',])
BRCA2.AF<-as.vector(gsub('-','.',BRCA2.AF))
pos<-which(colnames(sub_resnorad) %in% BRCA2.AF)

res_noradBRCA2.AF<-sub_resnorad[,pos]

# Put all data in a single vector
vectors<-NULL
for (i in 1:ncol(res_noradBRCA2.AF)){
  vector<-res_noradBRCA2.AF[,i]
  vectors<-c(vectors,vector)
}

data_plot2.AF <- data.frame(
  sample = rep(colnames(res_noradBRCA2.AF), each = 9),
  cell_type = rep(rownames(res_noradBRCA2.AF), times = 11),
  value=vectors)


ggplot(data = data_plot2.AF,
       aes(axis1 = sample, axis2 = cell_type, y = value)) +
  geom_alluvium(aes(fill = cell_type)) +
  geom_stratum() +
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("sample", "Cell type"),
                   expand = c(0.15, 0.05)) +
  theme_void() +ggtitle('   Cell type distributions in BRCA2 cancer-affected samples')

#----------------BRCA2 healthy-------------------------------------
BRCA2.SA<-rownames(pdata[pdata$Phenotype=='BRCA2.SA',])
BRCA2.SA<-as.vector(gsub('-','.',BRCA2.SA))
pos<-which(colnames(sub_resnorad) %in% BRCA2.SA)

res_noradBRCA2.SA<-sub_resnorad[,pos]

# Put all data in a single vector
vectors<-NULL
for (i in 1:ncol(res_noradBRCA2.SA)){
  vector<-res_noradBRCA2.SA[,i]
  vectors<-c(vectors,vector)
}

data_plot2.SA <- data.frame(
  sample = rep(colnames(res_noradBRCA2.SA), each = 9),
  cell_type = rep(rownames(res_noradBRCA2.SA), times = 9),
  value=vectors)


ggplot(data = data_plot2.SA,
       aes(axis1 = sample, axis2 = cell_type, y = value)) +
  geom_alluvium(aes(fill = cell_type)) +
  geom_stratum() +
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("sample", "Cell type"),
                   expand = c(0.15, 0.05)) +
  theme_void()+ggtitle('   Cell type distributions in BRCA2 healthy samples')

#----------------NOMUT healthy-------------------------------------
NOMUT.SA<-rownames(pdata[pdata$Phenotype=='NOMUT.SA',])
NOMUT.SA<-as.vector(gsub('-','.',NOMUT.SA))
pos<-which(colnames(sub_resnorad) %in% NOMUT.SA)

res_noradNOMUT.SA<-sub_resnorad[,pos]

# Put all data in a single vector
vectors<-NULL
for (i in 1:ncol(res_noradNOMUT.SA)){
  vector<-res_noradNOMUT.SA[,i]
  vectors<-c(vectors,vector)
}

data_plotNOMUT.SA <- data.frame(
  sample = rep(colnames(res_noradNOMUT.SA), each = 9),
  cell_type = rep(rownames(res_noradNOMUT.SA), times = 10),
  value=vectors)


ggplot(data = data_plotNOMUT.SA,
       aes(axis1 = sample, axis2 = cell_type, y = value)) +
  geom_alluvium(aes(fill = cell_type)) +
  geom_stratum() +
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("sample", "Cell type"),
                   expand = c(0.15, 0.05)) +
  theme_void()+ggtitle('   Cell type distributions in NO MUTATED healthy samples')

#----------------NOMUT affected-------------------------------------
NOMUT.AF<-rownames(pdata[pdata$Phenotype=='NOMUT.AF',])
NOMUT.AF<-as.vector(gsub('-','.',NOMUT.AF))
pos<-which(colnames(sub_resnorad) %in% NOMUT.AF)

res_noradNOMUT.AF<-sub_resnorad[,pos]

# Put all data in a single vector
vectors<-NULL
for (i in 1:ncol(res_noradNOMUT.AF)){
  vector<-res_noradNOMUT.AF[,i]
  vectors<-c(vectors,vector)
}

data_plotNOMUT.AF <- data.frame(
  sample = rep(colnames(res_noradNOMUT.AF), each = 9),
  cell_type = rep(rownames(res_noradNOMUT.AF), times = 5),
  value=vectors)


ggplot(data = data_plotNOMUT.AF,
       aes(axis1 = sample, axis2 = cell_type, y = value)) +
  geom_alluvium(aes(fill = cell_type)) +
  geom_stratum() +
  geom_text(stat = "stratum",
            aes(label = after_stat(stratum))) +
  scale_x_discrete(limits = c("sample", "Cell type"),
                   expand = c(0.15, 0.05)) +
  theme_void()+ggtitle('   Cell type distributions in NO MUTATED cancer-affected samples')

7.2 EPIC

We will also perform deconvolution using the EPIC method.

res_epic2 = deconvolute(2^exprs(brca.noduplicated),array=TRUE , "epic") 
## 
## >>> Running epic
res_epic <- res_epic2 %>% mutate(across(where(is.numeric), round, 3))
datatable(res_epic) 
res_epic<-as.data.frame(res_epic)
rownames(res_epic)<-res_epic$cell_type
res_epic<-res_epic[,-1]

test<-apply(res_epic,1,function(x) shapiro.test(x)$p.value)
test
##                       B cell Cancer associated fibroblast 
##                 9.297252e-15                 8.000092e-23 
##                  T cell CD4+                  T cell CD8+ 
##                 1.563248e-09                 8.256815e-22 
##             Endothelial cell                   Macrophage 
##                 4.152440e-20                 3.230597e-03 
##                      NK cell         uncharacterized cell 
##                 1.095635e-02                 2.438233e-20
pos<-which(test<0.05)
notdistributed<-res_epic[pos,]

In this case, all the cell types are not normal distributed. We will perform the same procedure that was done on Cibersort abs. mode: assessing the mean differences between 2 general groups: radiated vs non-radiated samples.

# --- radiated vs no-radiated --------------

# Separate radiated from non-radiated
table(pdata$Treatment)
## 
## NOR RAD 
##  53  53
# vector of conditions
condition_rad_norad=pdata$Treatment

mean_values<-NULL
for (i in 1: nrow(notdistributed)){
    mean<-tapply((unlist(notdistributed[i,-107])),condition_rad_norad, mean)
    rownam<-rownames(notdistributed[i,])
    mean_values<-c(mean_values,c(rownam,mean))
    }

# Function to format the mean_values into groups of 3
format_mean_values <- function(values) {
  result <- list()
  num_values <- length(values)
  i <- 1
  while (i <= num_values) {
    result <- c(result, list(values[i:(i + 2)]))
    i <- i + 3
  }
  return(result)
}

mean_values <- format_mean_values(mean_values)

# Convert mean_values to a data frame
mean_values_df <- do.call(rbind, mean_values)
mean_values_df <- as.data.frame(mean_values_df)

# Rename the columns
colnames(mean_values_df) <- c("Condition", "NOR", "RAD")

mean_values_df$NOR<-round(as.numeric(mean_values_df$NOR),4)
mean_values_df$RAD<-round(as.numeric(mean_values_df$RAD),4)

mean_values_df
##                      Condition    NOR    RAD
## 1                       B cell 0.0180 0.0144
## 2 Cancer associated fibroblast 0.0000 0.0000
## 3                  T cell CD4+ 0.8207 0.8171
## 4                  T cell CD8+ 0.0067 0.0095
## 5             Endothelial cell 0.0002 0.0002
## 6                   Macrophage 0.0801 0.0947
## 7                      NK cell 0.0635 0.0625
## 8         uncharacterized cell 0.0107 0.0014
pvals<-NULL
for (i in 1:nrow(res_epic)){
  pval<-wilcox.test(unlist(res_epic[i,-107])~condition_rad_norad)$p.val
  pvals<-c(pvals,pval)
}
res_epic$pvals<-pvals

res_epic[,c(1,107)]
##                              SG-17.CEL       pvals
## B cell                           0.017 0.375776374
## Cancer associated fibroblast     0.000 0.326527615
## T cell CD4+                      0.805 0.500951528
## T cell CD8+                      0.000 0.214802660
## Endothelial cell                 0.000 0.760690705
## Macrophage                       0.065 0.002765007
## NK cell                          0.047 0.914439299
## uncharacterized cell             0.066 0.071282267

Just the p-value of the Macrophage cell type is less than 0.05, which suggest that there are significant differences in the expression between radiated and non radiated samples. In the rest of the cell types, the p-value is higher than 0.05, which indicates that there are not significant differences between the mean of the cell types of radiated and no radiated samples.

7.2.1 T cell CD4 EPIC

Let’s prepare the T cell CD4 variable to perform a correlation analysis.

# Get T CD4+ cells expression levels. 
T_CD4_epic<-res_epic['T cell CD4+',-107]
T_CD4_epic<-t(T_CD4_epic)
T_CD4_epic<-as.data.frame(T_CD4_epic)
T_CD4_epic
##             T cell CD4+
## SG-17.CEL         0.805
## SG-18.CEL         0.515
## SG-19.CEL         0.921
## SG-20.CEL         0.927
## SG-21.CEL         0.877
## SG-22.CEL         0.859
## SG-23.CEL         0.796
## SG-24.CEL         0.795
## SG-25.CEL         0.829
## SG-26.CEL         0.813
## SG-27.CEL         0.830
## SG-28_2.CEL       0.932
## SG-31.CEL         0.641
## SG-32.CEL         0.707
## SG-33.CEL         0.743
## SG-34.CEL         0.863
## SG-35.CEL         0.764
## SG-36.CEL         0.833
## SG-37.CEL         0.854
## SG-38.CEL         0.833
## SG-39.CEL         0.856
## SG-40.CEL         0.852
## SG-41.CEL         0.879
## SG-42.CEL         0.837
## SG-43.CEL         0.732
## SG-44.CEL         0.732
## SG-45.CEL         0.838
## SG-46.CEL         0.836
## SG-47.CEL         0.893
## SG-48.CEL         0.875
## SG-49.CEL         0.866
## SG-50.CEL         0.848
## SG-51.CEL         0.885
## SG-52.CEL         0.852
## SG-53.CEL         0.872
## SG-54.CEL         0.853
## SG-55.CEL         0.888
## SG-56.CEL         0.892
## SG-57.CEL         0.751
## SG-58.CEL         0.748
## SG-59.CEL         0.861
## SG-60.CEL         0.863
## SG-63.CEL         0.807
## SG-64.CEL         0.803
## SG-65.CEL         0.888
## SG-66.CEL         0.878
## SG-67.CEL         0.748
## SG-68.CEL         0.708
## SG-69.CEL         0.870
## SG-70.CEL         0.872
## SG-71.CEL         0.831
## SG-72.CEL         0.829
## SG-73.CEL         0.816
## SG-74.CEL         0.807
## SG-75.CEL         0.881
## SG-76.CEL         0.852
## SG-77.CEL         0.877
## SG-78.CEL         0.856
## SG-79.CEL         0.817
## SG-80.CEL         0.823
## SG-81.CEL         0.854
## SG-82.CEL         0.812
## SG-83.CEL         0.867
## SG-84.CEL         0.847
## SG-85.CEL         0.746
## SG-86.CEL         0.749
## SG-87.CEL         0.815
## SG-88.CEL         0.808
## SG-89.CEL         0.887
## SG-90.CEL         0.862
## SG-91.CEL         0.885
## SG-92.CEL         0.892
## SG-93.CEL         0.760
## SG-94.CEL         0.736
## SG-95.CEL         0.784
## SG-96.CEL         0.796
## SG-97.CEL         0.836
## SG-98.CEL         0.806
## SG-99.CEL         0.854
## SG-100.CEL        0.834
## SG-101.CEL        0.798
## SG-102.CEL        0.671
## SG-103.CEL        0.790
## SG-104.CEL        0.789
## SG-105.CEL        0.850
## SG-106.CEL        0.797
## SG-107.CEL        0.437
## SG-108.CEL        0.767
## SG-109.CEL        0.834
## SG-110.CEL        0.884
## SG-111.CEL        0.899
## SG-112.CEL        0.901
## SG-113.CEL        0.776
## SG-114.CEL        0.789
## SG-115.CEL        0.793
## SG-116.CEL        0.775
## SG-117.CEL        0.782
## SG-118.CEL        0.798
## SG-119.CEL        0.837
## SG-120.CEL        0.827
## SG-121.CEL        0.753
## SG-122.CEL        0.760
## SG-123.CEL        0.850
## SG-124.CEL        0.837
## 127.CEL           0.893
## SG-128.CEL        0.877

7.3 MIXTURE

Finally, we will also perform deconvolution by using the MIXTURE method, based in cibersort.

library(MIXTURE)
res_mixt<-MIXTURE(2^exprs(brca.noduplicated),signatureMatrix = CIBERSORT_LM22)
## 
## MIXTURE version 1
## Provided signature matrix 547 genes and 22 cell types
## Overlapping genes : 532 (97.26%)
res_m<-as.data.frame(t(res_mixt$Subjects$MIXabs))

# remove cell types that do not have at least 5 unique values
distinct_values_count <- apply(res_m, 1, function(x) length(unique(x)))

# Subset the data frame to include rows with at least 5 distinct values
df_filtered <- res_m[distinct_values_count >= 5, ]


test<-apply(df_filtered,1,function(x) shapiro.test(x)$p.value)
test
##                B.cells.naive                  T.cells.CD8 
##                 1.689505e-07                 1.329368e-06 
##            T.cells.CD4.naive T.cells.CD4.memory.activated 
##                 1.693894e-05                 5.195785e-01 
##    T.cells.follicular.helper   T.cells.regulatory..Tregs. 
##                 1.301261e-03                 1.440611e-11 
##          T.cells.gamma.delta             NK.cells.resting 
##                 1.844063e-10                 3.696706e-12 
##           NK.cells.activated               Macrophages.M0 
##                 9.637498e-06                 3.672882e-10 
##               Macrophages.M1    Dendritic.cells.activated 
##                 8.352601e-04                 1.170704e-01 
##         Mast.cells.activated                  Eosinophils 
##                 7.408248e-04                 4.138107e-19 
##                  Neutrophils 
##                 1.655964e-20
pos<-which(test<0.05)

As most of the cell types do not follow a normal distribution, we will treat all the data as not normal distributed, and we will apply a Wilcoxon for testing paired data for testing the equality of 2 means.

7.3.1 Not normally distributed data cell types

# Separate radiated from non-radiated
table(pdata$Treatment)
## 
## NOR RAD 
##  53  53
# vector of conditions
condition_rad_norad=pdata$Treatment

mean_values<-NULL
for (i in 1: nrow(df_filtered)){
    mean<-tapply((unlist(df_filtered[i,])),condition_rad_norad, mean)
    rownam<-rownames(df_filtered[i,])
    mean_values<-c(mean_values,c(rownam,mean))
    }

# Function to format the mean_values into groups of 3
format_mean_values <- function(values) {
  result <- list()
  num_values <- length(values)
  i <- 1
  while (i <= num_values) {
    result <- c(result, list(values[i:(i + 2)]))
    i <- i + 3
  }
  return(result)
}

mean_values <- format_mean_values(mean_values)

# Convert mean_values to a data frame
mean_values_df <- do.call(rbind, mean_values)
mean_values_df <- as.data.frame(mean_values_df)

# Rename the columns
colnames(mean_values_df) <- c("Condition", "NOR", "RAD")

mean_values_df$NOR<-round(as.numeric(mean_values_df$NOR),4)
mean_values_df$RAD<-round(as.numeric(mean_values_df$RAD),4)

mean_values_df
##                       Condition    NOR    RAD
## 1                 B.cells.naive 0.0210 0.0208
## 2                   T.cells.CD8 0.0422 0.0556
## 3             T.cells.CD4.naive 0.0441 0.0514
## 4  T.cells.CD4.memory.activated 0.5582 0.4981
## 5     T.cells.follicular.helper 0.0610 0.0701
## 6    T.cells.regulatory..Tregs. 0.0249 0.0301
## 7           T.cells.gamma.delta 0.0192 0.0210
## 8              NK.cells.resting 0.0392 0.0408
## 9            NK.cells.activated 0.0980 0.0939
## 10               Macrophages.M0 0.0261 0.0511
## 11               Macrophages.M1 0.0723 0.0617
## 12    Dendritic.cells.activated 0.0471 0.0544
## 13         Mast.cells.activated 0.0256 0.0286
## 14                  Eosinophils 0.0013 0.0015
## 15                  Neutrophils 0.0003 0.0011

Apply statistical tests:

# --- radiated vs no-radiated --------------

pvals<-NULL
for (i in 1:nrow(df_filtered)){
  pval<-wilcox.test(unlist(df_filtered[i,])~condition_rad_norad)$p.val
  pvals<-c(pvals,pval)
}

df_filtered$pvals<-pvals

df<-df_filtered[,106:107]
df[-1]
##                                    pvals
## B.cells.naive                0.934313705
## T.cells.CD8                  0.137652805
## T.cells.CD4.naive            0.321457436
## T.cells.CD4.memory.activated 0.062322324
## T.cells.follicular.helper    0.184034239
## T.cells.regulatory..Tregs.   0.398475960
## T.cells.gamma.delta          0.693879550
## NK.cells.resting             0.723163293
## NK.cells.activated           0.745224335
## Macrophages.M0               0.007847483
## Macrophages.M1               0.026137853
## Dendritic.cells.activated    0.017965298
## Mast.cells.activated         0.197334746
## Eosinophils                  0.573571181
## Neutrophils                  0.217610148

Only the Macrophages.MO cell type has a p-value less than 0.05. The other cell types present a p-value higher than 0.05, which suggests that there are no statistically significant differences between the mean expression values of radiated and non radiated samples.

7.3.1.1 Total T CD4 cells

# Select T cells CD4 naive
T_CD4_naive_mixt<-df_filtered['T.cells.CD4.naive',-107]
T_CD4_naive_mixt<-as.data.frame(t(T_CD4_naive_mixt))


# Select T cells CD4 memory activated
T_CD4_memory_act_mixt<-df_filtered['T.cells.CD4.memory.activated',-107]
T_CD4_memory_act_mixt<-as.data.frame(t(T_CD4_memory_act_mixt))


# Select T CD4 cells 
TCD4_mixture<-df_filtered[c('T.cells.CD4.naive','T.cells.CD4.memory.activated'),-107]

#apply(TCD4_mixture,2,sum)
Total_T_CD4_mixture<-colSums(TCD4_mixture)
Total_T_CD4_mixture<-as.data.frame(Total_T_CD4_mixture)
Total_T_CD4_mixture
##             Total_T_CD4_mixture
## SG-17.CEL             0.7227444
## SG-18.CEL             0.6622090
## SG-19.CEL             0.7698298
## SG-20.CEL             0.6513247
## SG-21.CEL             0.6508326
## SG-22.CEL             0.6237699
## SG-23.CEL             0.6022028
## SG-24.CEL             0.4137894
## SG-25.CEL             0.5447275
## SG-26.CEL             0.5008375
## SG-27.CEL             0.8456932
## SG-28_2.CEL           0.9683742
## SG-31.CEL             0.5196936
## SG-32.CEL             0.5599829
## SG-33.CEL             0.8570890
## SG-34.CEL             0.9128199
## SG-35.CEL             0.7829372
## SG-36.CEL             0.8605800
## SG-37.CEL             0.5370510
## SG-38.CEL             0.4743053
## SG-39.CEL             0.7764444
## SG-40.CEL             0.8940292
## SG-41.CEL             0.9279351
## SG-42.CEL             0.7407673
## SG-43.CEL             0.4263578
## SG-44.CEL             0.4534880
## SG-45.CEL             0.8255637
## SG-46.CEL             0.8222252
## SG-47.CEL             0.7582386
## SG-48.CEL             0.6304863
## SG-49.CEL             0.7570128
## SG-50.CEL             0.6607535
## SG-51.CEL             0.7208886
## SG-52.CEL             0.5750902
## SG-53.CEL             0.8973038
## SG-54.CEL             0.7679818
## SG-55.CEL             0.7508349
## SG-56.CEL             0.8168288
## SG-57.CEL             0.4326015
## SG-58.CEL             0.3791665
## SG-59.CEL             0.6898237
## SG-60.CEL             0.6421529
## SG-63.CEL             0.5944003
## SG-64.CEL             0.4458255
## SG-65.CEL             0.7650210
## SG-66.CEL             0.5664822
## SG-67.CEL             0.3107340
## SG-68.CEL             0.3441330
## SG-69.CEL             0.5558185
## SG-70.CEL             0.7021891
## SG-71.CEL             0.5353809
## SG-72.CEL             0.6318807
## SG-73.CEL             0.4935458
## SG-74.CEL             0.2966467
## SG-75.CEL             0.6706202
## SG-76.CEL             0.4744458
## SG-77.CEL             0.5588433
## SG-78.CEL             0.4353342
## SG-79.CEL             0.5222639
## SG-80.CEL             0.3707288
## SG-81.CEL             0.4984131
## SG-82.CEL             0.4079856
## SG-83.CEL             0.7985089
## SG-84.CEL             0.7065148
## SG-85.CEL             0.3714197
## SG-86.CEL             0.2907388
## SG-87.CEL             0.5736820
## SG-88.CEL             0.5763532
## SG-89.CEL             0.7031914
## SG-90.CEL             0.5467856
## SG-91.CEL             0.3615526
## SG-92.CEL             0.3174581
## SG-93.CEL             0.2810744
## SG-94.CEL             0.2680761
## SG-95.CEL             0.6506295
## SG-96.CEL             0.4638009
## SG-97.CEL             0.5577679
## SG-98.CEL             0.5294182
## SG-99.CEL             0.5903626
## SG-100.CEL            0.4563102
## SG-101.CEL            0.2304045
## SG-102.CEL            0.1702647
## SG-103.CEL            0.3263064
## SG-104.CEL            0.3107224
## SG-105.CEL            0.3434178
## SG-106.CEL            0.3481191
## SG-107.CEL            0.2428568
## SG-108.CEL            0.3490943
## SG-109.CEL            0.8219128
## SG-110.CEL            0.6360979
## SG-111.CEL            0.6381892
## SG-112.CEL            0.5678088
## SG-113.CEL            0.6333835
## SG-114.CEL            0.5859477
## SG-115.CEL            0.5761211
## SG-116.CEL            0.5370023
## SG-117.CEL            0.4424124
## SG-118.CEL            0.3273496
## SG-119.CEL            0.6240447
## SG-120.CEL            0.5905892
## SG-121.CEL            0.4614982
## SG-122.CEL            0.5945873
## SG-123.CEL            0.5243628
## SG-124.CEL            0.4238826
## 127.CEL               0.8678341
## SG-128.CEL            0.8383384

In conclusion from the deconvolution analysis, the proportion of T CD4 cell type in all the 3 methods is very high, and there is no significant differences between the mean expression values of this cell types between radiated and non radiated samples.

7.4 CORRELATIONS

In this section, correlation analysis between the 3 deconvolution methods used will be performed. The goal is to try to identify which method will be used to obtain the TCD4 cell type covariate to include it in the variable selection with LASSO step.

7.4.1 Correlation EPIC-MIXTURE

library(ggplot2)
library(ggpubr)
T_cellcd4_mixt_epic<-cbind(Total_T_CD4_mixture,T_CD4_epic)
colnames(T_cellcd4_mixt_epic)<-c('TCD4_MIXTURE','TCD4_EPIC')

cor.test(T_cellcd4_mixt_epic$TCD4_MIXTURE,T_cellcd4_mixt_epic$TCD4_EPIC,method = 'spearman')
## 
##  Spearman's rank correlation rho
## 
## data:  T_cellcd4_mixt_epic$TCD4_MIXTURE and T_cellcd4_mixt_epic$TCD4_EPIC
## S = 91523, p-value = 2.534e-09
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.5388943
ggplot(data=T_cellcd4_mixt_epic,aes(x = TCD4_MIXTURE, y = TCD4_EPIC)) + geom_point(color='black')+stat_smooth(method = 'lm') + 
  ggtitle('Correlation TCD4 EPIC and TCD4 MIXTURE') +xlab('TCD4 MIXTURE') +ylab('TCD4 EPIC')

7.4.2 Correlation CIBERSORT-MIXTURE

T_cellcd4_ciber_mixt<-cbind(Total_T_CD4_ciber,Total_T_CD4_mixture)
colnames(T_cellcd4_ciber_mixt)<-c('TCD4_CIBERSORT_abs','TCD4_MIXTURE')

cor.test(T_cellcd4_ciber_mixt$TCD4_CIBERSORT_abs,T_cellcd4_ciber_mixt$TCD4_MIXTURE,method = 'spearman')
## 
##  Spearman's rank correlation rho
## 
## data:  T_cellcd4_ciber_mixt$TCD4_CIBERSORT_abs and T_cellcd4_ciber_mixt$TCD4_MIXTURE
## S = 21266, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.8928596
ggplot(data=T_cellcd4_ciber_mixt,aes(x = TCD4_CIBERSORT_abs, y = TCD4_MIXTURE)) + geom_point(color='black')+stat_smooth(method = 'lm') + 
  ggtitle('Correlation TCD4 CIBERSORT abs mode and TCD4 MIXTURE') +xlab('TCD4 CIBERSORT abs mode') +ylab('TCD4 MIXTURE')
## `geom_smooth()` using formula = 'y ~ x'

As both Cibersort and MIXTURE present detailed TCD4 cell types, we will assess the correlation between TCD4 naive and memory activated cell types.

7.4.2.1 CD4 NAIVE

T_cd4_naive_both<-cbind(T_CD4_naive_ciber,T_CD4_naive_mixt)
colnames(T_cd4_naive_both)<-c('naive_CIBERSORT_abs','naive_MIXT')

cor.test(T_cd4_naive_both$naive_CIBERSORT_abs,T_cd4_naive_both$naive_MIXT,method = 'spearman')
## 
##  Spearman's rank correlation rho
## 
## data:  T_cd4_naive_both$naive_CIBERSORT_abs and T_cd4_naive_both$naive_MIXT
## S = 50456, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.7457927
ggplot(data=T_cd4_naive_both,aes(x = naive_CIBERSORT_abs, y = naive_MIXT)) + geom_point(color='black')+stat_smooth(method = 'lm') + 
  ggtitle('Correlation TCD4 naive CIBERSORT abs mode and TCD4 naive MIXTURE') +xlab('TCD4 naive CIBERSORT abs mode') +ylab('TCD4 naive MIXTURE')
## `geom_smooth()` using formula = 'y ~ x'

7.4.2.2 CD4 MEMORY ACTIVATED

T_cd4_MEMORY_both<-cbind(T_CD4_memory_act_ciber,T_CD4_memory_act_mixt)
colnames(T_cd4_MEMORY_both)<-c('CIBERSORT_abs','MIXT')

cor.test(T_cd4_MEMORY_both$CIBERSORT_abs,T_cd4_MEMORY_both$MIXT,method = 'spearman')
## 
##  Spearman's rank correlation rho
## 
## data:  T_cd4_MEMORY_both$CIBERSORT_abs and T_cd4_MEMORY_both$MIXT
## S = 23082, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##      rho 
## 0.883709
ggplot(data=T_cd4_MEMORY_both,aes(x = CIBERSORT_abs, y = MIXT)) + geom_point(color='black')+stat_smooth(method = 'lm') + 
  ggtitle('Correlation TCD4 memory activated CIBERSORT and  MIXTURE') +xlab('TCD4 memory activated CIBERSORT abs mode') +ylab('TCD4 memory activated MIXTURE')
## `geom_smooth()` using formula = 'y ~ x'

7.4.3 Correlation CIBERSORT-EPIC

T_cellcd4_ciber_epic<-cbind(Total_T_CD4_ciber,T_CD4_epic)
colnames(T_cellcd4_ciber_epic)<-c('TCD4_CIBERSORT_abs','TCD4_EPIC')

cor.test(T_cellcd4_ciber_epic$TCD4_CIBERSORT_abs,T_cellcd4_ciber_epic$TCD4_EPIC,method = 'spearman')
## 
##  Spearman's rank correlation rho
## 
## data:  T_cellcd4_ciber_epic$TCD4_CIBERSORT_abs and T_cellcd4_ciber_epic$TCD4_EPIC
## S = 120965, p-value = 3.488e-05
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##     rho 
## 0.39056
ggplot(data=T_cellcd4_ciber_epic,aes(x = TCD4_CIBERSORT_abs, y = TCD4_EPIC)) + geom_point(color='black')+stat_smooth(method = 'lm') + 
  ggtitle('Correlation TCD4 CIBERSORT abs mode and TCD4 EPIC') +xlab('TCD4 CIBERSORT') +ylab('TCD4 EPIC')
## `geom_smooth()` using formula = 'y ~ x'

In conclusion, we can see that Cibersort abs.mode and MIXTURE are the 2 methods that are highly correlated, which is what we expected as MIXTURE is based in cibersort. In the other hand, we can see that the correlation is not strong when using EPIC, as it tends to overestimate the fraction of cell types. For this reason, EPIC will be discarded. Then, we have no evidence that MIXTURE provides an score that allow to perform both between cell and between sample comparisons, so for this reason, we will use CIBERSORT abs. mode as the method to obtain the TCD4 cell type expression and to use it as a covariate in LASSO and in Differential expression analysis steps.

8 Subset no-radiated samples

From now on, we will focus on the analysis of no-radiated samples.

noradiated<-rownames(pdata[pdata$Treatment=='NOR',])
length(noradiated)
## [1] 53
brca.noduplicated.norad<-brca.noduplicated[,noradiated]
dim(brca.noduplicated.norad)
## Features  Samples 
##    21923       53

9 Filtering

There are different approaches to filter data, avoiding unexpressed genes or genes that do not vary among different conditions. In this case, we will filter by standard deviation.

head(exprs(brca.noduplicated.norad[grep('BRCA|MYC',rownames(exprs(brca.noduplicated.norad))),]))[,1:5]
##            SG-17.CEL SG-19.CEL SG-21.CEL SG-23.CEL SG-25.CEL
## BRCA1       6.514293  6.023955  5.892121  5.763817  5.100376
## BRCA2       4.538271  4.060444  4.806153  4.066198  3.941448
## GJA9-MYCBP  8.116303  7.778913  7.750568  7.728113  7.574291
## MYC        10.861189 10.551826 10.417730 10.268605  9.589980
## MYCBP2      9.143891  9.407169  9.566963  8.806109  9.453326
## MYCBPAP     2.799831  2.873665  2.856664  3.190349  3.028650

Before filtering we have the presence of both BRCA1 and BRCA2 genes. BRCA1 expression levels are a little bit higher than BRCA2. Then, there are 4 genes related to the MYC family, being the MYC with the highest expression gene.

Different filtering criteria was applied (standard deviation 0.75, 0.5 and no filtering). 0.75 resulted to be too stringent, so we kept with the standard deviation 0.5 filtering criteria.

sd <- apply(exprs(brca.noduplicated.norad),1,sd)
summary(sd)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0451  0.1510  0.1922  0.2259  0.2538  1.9913
threshold<-quantile(sd,0.5)

top.pos<-which(sd>threshold)
brca.rma.norad_filt<-brca.noduplicated.norad[top.pos,]

dim(brca.rma.norad_filt)
## Features  Samples 
##    10961       53
pData(brca.rma.norad_filt)<-pdata[noradiated,]

After filtering, we keep with 10961 probe sets.

brca<-exprs(brca.rma.norad_filt)
exprs(brca.rma.norad_filt[grep('BRCA|MYC',rownames(exprs(brca.rma.norad_filt))),])[,1:5]
##            SG-17.CEL SG-19.CEL SG-21.CEL SG-23.CEL SG-25.CEL
## BRCA1       6.514293  6.023955  5.892121  5.763817  5.100376
## BRCA2       4.538271  4.060444  4.806153  4.066198  3.941448
## GJA9-MYCBP  8.116303  7.778913  7.750568  7.728113  7.574291
## MYC        10.861189 10.551826 10.417730 10.268605  9.589980
## MYCNUT      3.689155  3.876113  3.894650  4.142342  4.194127
rownames(brca)[1]<-'TCONS_00029157'
brca<-brca[order(rownames(brca)),]

brca<-ExpressionSet(assayData = brca)

#rownames(pdata)<-gsub('-','.',rownames(pdata))
#rownames(pdata)<-gsub('127.CEL','X127.CEL',rownames(pdata))

pData(brca)<-pdata[noradiated,]
colnames(brca)
##  [1] "SG-17.CEL"  "SG-19.CEL"  "SG-21.CEL"  "SG-23.CEL"  "SG-25.CEL" 
##  [6] "SG-27.CEL"  "SG-31.CEL"  "SG-33.CEL"  "SG-35.CEL"  "SG-37.CEL" 
## [11] "SG-39.CEL"  "SG-41.CEL"  "SG-43.CEL"  "SG-45.CEL"  "SG-47.CEL" 
## [16] "SG-49.CEL"  "SG-51.CEL"  "SG-53.CEL"  "SG-55.CEL"  "SG-57.CEL" 
## [21] "SG-59.CEL"  "SG-63.CEL"  "SG-65.CEL"  "SG-67.CEL"  "SG-69.CEL" 
## [26] "SG-71.CEL"  "SG-73.CEL"  "SG-75.CEL"  "SG-77.CEL"  "SG-79.CEL" 
## [31] "SG-81.CEL"  "SG-83.CEL"  "SG-85.CEL"  "SG-87.CEL"  "SG-89.CEL" 
## [36] "SG-91.CEL"  "SG-93.CEL"  "SG-95.CEL"  "SG-97.CEL"  "SG-99.CEL" 
## [41] "SG-101.CEL" "SG-103.CEL" "SG-105.CEL" "SG-107.CEL" "SG-109.CEL"
## [46] "SG-111.CEL" "SG-113.CEL" "SG-115.CEL" "SG-117.CEL" "SG-119.CEL"
## [51] "SG-121.CEL" "SG-123.CEL" "127.CEL"
head(exprs(brca))[,1:5]
##         SG-17.CEL SG-19.CEL SG-21.CEL SG-23.CEL SG-25.CEL
## A1BG     5.444171  5.829718  5.465967  5.318205  5.139036
## A2M-AS1  2.781938  4.062422  3.625387  2.741535  4.308031
## A2MP1    2.549829  3.056435  3.075571  3.209491  3.058632
## AA06     3.212490  3.478477  3.623134  3.723501  3.904911
## AAAS     7.370134  7.127275  7.282128  6.933398  6.716671
## AAED1    7.976834  7.914848  8.089985  8.263372  8.318438

BRCA1 and BCRA2 both pass the filtering step and 3 genes from the MYC family, which means that the standard deviation of the expression levels of these genes is over the 2nd quantile of standard deviation.

10 Variable selection binomial

This approach takes into account the differences in predictor effects across the comparisons. It allows to identify the specific predictor variables that are associated with each comparison. In this case, it’s reasonable to expect that certain predictor variables may not have an effect in all the comparisons. For example, if a predictor variable represents chemotherapy treatment, it may only be relevant in comparisons involving individuals who have undergone chemotherapy.

Therefore, using a binomial approach for variable selection will allow to capture these specific associations and identify predictor variables that are relevant to each comparison separately, so we will get the variables that are better adjusted with the phenotype, in this case. This can provide more accurate and meaningful results for your analysis.

We want to study the next comparisons:

  • BRCA1.AF vs NOMUT.AF

  • BRCA1.SA VS NOMUT.SA

  • BRCA2.AF VS NOMUT.AF

  • BRCA2.SA VS NOMUT.SA

# ---  Load the data ---
library(readxl)
Patient_info <- as.data.frame(read_excel("../Patient info.xlsx"))

# --- Prepare it ---
# Change the row names for their array code identification
rownames(Patient_info)<-Patient_info$`ARRAY CODE`
rownames(Patient_info)<-gsub('LL','L',rownames(Patient_info))
rownames(Patient_info)<-gsub('SG-127.CEL','X127.CEL',rownames(Patient_info))
rownames(Patient_info)<-gsub('SG-13.CEL','SG-103.CEL',rownames(Patient_info))
rownames(Patient_info)<-gsub('SG-14.CEL','SG-104.CEL',rownames(Patient_info))
rownames(Patient_info)<-gsub('-','.',rownames(Patient_info))


#Rename smoking variable of patients
Patient_info$`SMOKING HISTORY`[Patient_info$`SMOKING HISTORY`=='EX']<-'SI'
Patient_info$`SMOKING HISTORY`[Patient_info$`SMOKING HISTORY`=='Yes']<-'SI'

Patient_info$`YEARS AFTER RADIO`[is.na(Patient_info$`YEARS AFTER RADIO`)]<-0
Patient_info$`YEARS AFTER QUIMIO`[is.na(Patient_info$`YEARS AFTER QUIMIO`)]<-0

# Get only no-radiated samples
noradiated<-gsub('-','.',noradiated)
noradiated<-gsub('127.CEL','X127.CEL',noradiated)
Patient_info<-Patient_info[noradiated,]

# Remove NA patients from smoking history
Patients_smoking<-Patient_info[!is.na(Patient_info$`SMOKING HISTORY`),]
smoking_NA<-Patient_info[is.na(Patient_info$`SMOKING HISTORY`),]
dim(smoking_NA) # 7 patients have NA for smoking history
## [1]  7 20
colnames(brca)<-gsub('-','.',colnames(brca))
colnames(brca)<-gsub('127.CEL','X127.CEL',colnames(brca))
smoking_samples<-intersect(colnames(brca),rownames(Patients_smoking))

#Select only the patients with smoking information from the filtered and normalized object
brca_smoking<-brca[,smoking_samples]

#Transform to factor
Patients_smoking$`SMOKING HISTORY`<-as.factor(Patients_smoking$`SMOKING HISTORY`)
Patients_smoking$RADIO<-as.factor(Patients_smoking$RADIO)
Patients_smoking$QUIMIO<-as.factor(Patients_smoking$QUIMIO)
#str(Patients_smoking)

#Add phenotype
Patients_smoking$Phenotype<-pData(brca_smoking)$Phenotype
Patients_smoking$Phenotype<-as.factor(Patients_smoking$Phenotype)

Check the column names and the row names are in the same order:

identical(rownames(Patients_smoking),colnames(brca_smoking))
## [1] TRUE
Patients_smoking$Phenotype<-pData(brca_smoking)$Phenotype

table(Patients_smoking$Phenotype)
## 
## BRCA1.AF BRCA1.SA BRCA2.AF BRCA2.SA NOMUT.AF NOMUT.SA 
##        8        8       10        6        4       10

10.1 Data summary

Summary of the important variables:

library(tableone)

factor(Patient_info$CARRIER)
##  [1] BRCA2     BRCA2     NUMUT     BRCA2     BRCA1     BRCA1     BRCA2    
##  [8] BRCA2     BRCA2     BRCA1     BRCA2     BRCA2     BRCA1     BRCA2    
## [15] BRCA2     BRCA1     BRCA2     BRCA2     BRCA2     BRCA2     BRCA1    
## [22] BRCA2     BRCA2     BRCA2     NUMUT     NUMUT     NUMUT     NUMUT    
## [29] NUMUT     NUMUT     BRCA2     BRCA2     BRCA1     BRCA2     BRCA1    
## [36] NUMUT     BRCA1     BRCA1     BRCA1     BRCA1     NUMUT     NUMUT    
## [43] BRCA1     BRCA1     BRCA1     BRCA1     BRCA1     CAN-ESPOR CAN-ESPOR
## [50] CAN-ESPOR CAN-ESPOR CAN-ESPOR BRCA1    
## Levels: BRCA1 BRCA2 CAN-ESPOR NUMUT
#Rename smoking variable of patients
Patient_info$CARRIER[Patient_info$CARRIER=='NUMUT']<-'NUMUT'
Patient_info$CARRIER[Patient_info$CARRIER=='CAN-ESPOR']<-'NUMUT'

Patients_smoking$CARRIER[Patients_smoking$CARRIER=='NUMUT']<-'NUMUT'
Patients_smoking$CARRIER[Patients_smoking$CARRIER=='CAN-ESPOR']<-'NUMUT'


tableOne<-CreateTableOne(data=Patient_info[,c(6,7,8,15,16,17,18,19,20)])
tableOne_smoking<-CreateTableOne(data=Patients_smoking[,c(6,7,8,15,16,17,18,19,20)])
summary(tableOne)
## 
##      ### Summary of continuous variables ###
## 
## strata: Overall
##                       n miss p.miss mean sd median p25 p75 min max skew kurt
## AGE, EXTRACTION TIME 53    0      0   45 12     46  36  55  19  71 -0.1 -0.7
## YEARS AFTER QUIMIO   53    0      0    5  7      0   0   7   0  28  1.7  2.5
## YEARS AFTER RADIO    53    0      0    5  7      0   0   7   0  27  1.7  2.3
## YEARS OF SMOKING     53   40     75   24 10     20  20  27  12  41  0.8 -0.4
## 
## =======================================================================================
## 
##      ### Summary of categorical variables ### 
## 
## strata: Overall
##               var  n miss p.miss    level freq percent cum.percent
##           CARRIER 53    0    0.0    BRCA1   18    34.0        34.0
##                                     BRCA2   20    37.7        71.7
##                                     NUMUT   15    28.3       100.0
##                                                                   
##  HEALTH CONDITION 53    0    0.0 AFFECTED   26    49.1        49.1
##                                   HEALTHY   27    50.9       100.0
##                                                                   
##            QUIMIO 53    0    0.0       NO   30    56.6        56.6
##                                        SI   23    43.4       100.0
##                                                                   
##             RADIO 53    0    0.0       NO   29    54.7        54.7
##                                        SI   24    45.3       100.0
##                                                                   
##   SMOKING HISTORY 53    7   13.2       NO   21    45.7        45.7
##                                        SI   25    54.3       100.0
## 
summary(tableOne_smoking)
## 
##      ### Summary of continuous variables ###
## 
## strata: Overall
##                       n miss p.miss mean sd median p25 p75 min max  skew kurt
## AGE, EXTRACTION TIME 46    0      0   46 12     46  37  55  19  71 -0.07 -0.6
## YEARS AFTER QUIMIO   46    0      0    5  8      0   0   7   0  28  1.74  2.3
## YEARS AFTER RADIO    46    0      0    4  7      0   0   7   0  27  1.75  2.4
## YEARS OF SMOKING     46   33     72   24 10     20  20  27  12  41  0.78 -0.4
## 
## =======================================================================================
## 
##      ### Summary of categorical variables ### 
## 
## strata: Overall
##               var  n miss p.miss    level freq percent cum.percent
##           CARRIER 46    0    0.0    BRCA1   16    34.8        34.8
##                                     BRCA2   16    34.8        69.6
##                                     NUMUT   14    30.4       100.0
##                                                                   
##  HEALTH CONDITION 46    0    0.0 AFFECTED   22    47.8        47.8
##                                   HEALTHY   24    52.2       100.0
##                                                                   
##            QUIMIO 46    0    0.0       NO   26    56.5        56.5
##                                        SI   20    43.5       100.0
##                                                                   
##             RADIO 46    0    0.0       NO   26    56.5        56.5
##                                        SI   20    43.5       100.0
##                                                                   
##   SMOKING HISTORY 46    0    0.0       NO   21    45.7        45.7
##                                        SI   25    54.3       100.0
## 
#print(tableOne, showAllLevels = TRUE, formatOptions = list(big.mark = ","))

10.2 BRCA1

10.2.1 - BRCA1.SA vs NOMUT.SA

Only 3 variables to consider: age, smoking and cibersort.

# --- Select the patients that have this condition ---
Patients_brca1.sa<-Patients_smoking[Patients_smoking$Phenotype=='BRCA1.SA'| Patients_smoking$Phenotype=='NOMUT.SA',]
brca_smk_brca1.sa<-brca[,rownames(Patients_brca1.sa)]
#str(Patients_brca1.sa)

# --- Variable selection ---
ybrca1.sa<-as.factor(Patients_brca1.sa$Phenotype)
xbrca1.sa<-Patients_brca1.sa[,c('AGE, EXTRACTION TIME','SMOKING HISTORY')]

Total_T_CD4_cibersort<-data.frame(Total_T_CD4_ciber)
Total_T_CD4_cibersort_brca1<-as.data.frame(Total_T_CD4_ciber[intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca1.sa)),])
rownames(Total_T_CD4_cibersort_brca1)<-intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca1.sa))
colnames(Total_T_CD4_cibersort_brca1)<-'Total_T_CD4_ciber'

x <- model.matrix(ybrca1.sa~xbrca1.sa$`AGE, EXTRACTION TIME`+ xbrca1.sa$`SMOKING HISTORY`+Total_T_CD4_cibersort_brca1$Total_T_CD4_ciber-1)
x<-x[,-2]

# Fit the model
lasso_model <- cv.glmnet(x, ybrca1.sa, family = "binomial", standardize=TRUE)

# Select the optimal lambda value based on cross-validation
optimal_lambda <- lasso_model$lambda.min

coef(lasso_model,s=optimal_lambda)
## 4 x 1 sparse Matrix of class "dgCMatrix"
##                                                      s1
## (Intercept)                                   0.2231436
## xbrca1.sa$`AGE, EXTRACTION TIME`              .        
## xbrca1.sa$`SMOKING HISTORY`SI                 .        
## Total_T_CD4_cibersort_brca1$Total_T_CD4_ciber .

The model has shrinked to 0 all the coefficients for the 3 variables we were considering, so in this case, LASSO suggests that no variables have an impact to the phenotype.

10.2.2 - BRCA1.AF vs NOMUT.AF

Now, as we are selecting cancer affected patients, we have to consider other variables such as chemotherapy, radiotherapy, and the years that have passed after the patient underwent these treatments.

Patients_smoking$QUIMIO<-as.factor(Patients_smoking$QUIMIO)
Patients_smoking$RADIO<-as.factor(Patients_smoking$RADIO)
Patients_smoking$`SMOKING HISTORY`<-as.factor(Patients_smoking$`SMOKING HISTORY`)

# --- Select the patients that have this condition ---
Patients_brca1.af<-Patients_smoking[Patients_smoking$Phenotype=='BRCA1.AF'| Patients_smoking$Phenotype=='NOMUT.AF',]
brca_smk_brca1.af<-brca[,rownames(Patients_brca1.af)]

# --- Variable selection ---
ybrca1.af<-as.factor(Patients_brca1.af$Phenotype)
xbrca1.af<-Patients_brca1.af[,c('AGE, EXTRACTION TIME','QUIMIO','RADIO','YEARS AFTER QUIMIO','YEARS AFTER RADIO','SMOKING HISTORY')]

rownames(Patients_brca1.af)<-gsub('-','.',rownames(Patients_brca1.af))

Total_T_CD4_cibersort_BRCA.AF<-as.data.frame(Total_T_CD4_ciber[intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca1.af)),])
rownames(Total_T_CD4_cibersort_BRCA.AF)<-intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca1.af))
colnames(Total_T_CD4_cibersort_BRCA.AF)<-'Total_T_CD4_ciber'


x <- model.matrix(ybrca1.af~xbrca1.af$`AGE, EXTRACTION TIME`+xbrca1.af$QUIMIO+xbrca1.af$RADIO+xbrca1.af$`YEARS AFTER QUIMIO`+xbrca1.af$`YEARS AFTER RADIO`+xbrca1.af$`SMOKING HISTORY`+Total_T_CD4_cibersort_BRCA.AF$Total_T_CD4_ciber-1)
x<-x[,-2]

# Fit the model
lasso_model <- cv.glmnet(x, ybrca1.af, family = "binomial", standardize=TRUE)

# Select the optimal lambda value based on cross-validation
optimal_lambda <- lasso_model$lambda.min

coef(lasso_model,s=optimal_lambda)
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                                                          s1
## (Intercept)                                     -6.78061315
## xbrca1.af$`AGE, EXTRACTION TIME`                 0.08007821
## xbrca1.af$QUIMIOSI                               .         
## xbrca1.af$RADIOSI                                .         
## xbrca1.af$`YEARS AFTER QUIMIO`                   .         
## xbrca1.af$`YEARS AFTER RADIO`                    .         
## xbrca1.af$`SMOKING HISTORY`SI                    2.44774203
## Total_T_CD4_cibersort_BRCA.AF$Total_T_CD4_ciber  .

In this case, LASSO indicates that age and smoking have an impact the the phenotype, so this variables will be added to the Limma model.

As we will compare within BRCA1 carriers, the two models, BRCA1.healthy vs NOMUT.healthy and BRCA1.affected and NOMUT.affected need to have the same variables. For this reason, when performing the differential expression analysis, we will add smoking and age as covariates for both of the models (healthy and affected).

10.3 BRCA2

We will repeat the same process for BRCA2.

10.3.1 - BRCA2.SA VS NOMUT.SA

# --- Select the patients that have this condition ---
Patients_brca2.sa<-Patients_smoking[Patients_smoking$Phenotype=='BRCA2.SA'| Patients_smoking$Phenotype=='NOMUT.SA',]
brca_smk_brca2.sa<-brca[,rownames(Patients_brca2.sa)]


# --- Variable selection ---
ybrca2.sa<-as.factor(Patients_brca2.sa$Phenotype)
xbrca2.sa<-Patients_brca2.sa[,c('AGE, EXTRACTION TIME','SMOKING HISTORY')]


Total_T_CD4_ciber_brca2.sa<-as.data.frame(Total_T_CD4_ciber[intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca2.sa)),])
rownames(Total_T_CD4_ciber_brca2.sa)<-intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca2.sa))
colnames(Total_T_CD4_ciber_brca2.sa)<-'Total_T_CD4_ciber'


x <- model.matrix(ybrca2.sa~xbrca2.sa$`AGE, EXTRACTION TIME`+ xbrca2.sa$`SMOKING HISTORY`+Total_T_CD4_ciber_brca2.sa$Total_T_CD4_ciber-1)
x<-x[,-2]

# Fit the model
lasso_model <- cv.glmnet(x, ybrca2.sa, family = "binomial", standardize=TRUE)

# Select the optimal lambda value based on cross-validation
optimal_lambda <- lasso_model$lambda.min

coef(lasso_model,s=optimal_lambda)
## 4 x 1 sparse Matrix of class "dgCMatrix"
##                                                      s1
## (Intercept)                                   3.6071993
## xbrca2.sa$`AGE, EXTRACTION TIME`              0.1170684
## xbrca2.sa$`SMOKING HISTORY`SI                -2.6113084
## Total_T_CD4_ciber_brca2.sa$Total_T_CD4_ciber -7.7970161

In this case, all the variables that were possible to consider are important for the model, as LASSO do not shrink its coefficient to 0.

10.3.2 - BRCA2.AF VS NOMUT.AF

# --- Select the patients that have this condition ---
Patients_brca2.af<-Patients_smoking[Patients_smoking$Phenotype=='BRCA2.AF'| Patients_smoking$Phenotype=='NOMUT.AF',]
brca_smk_brca2.af<-brca[,rownames(Patients_brca2.af)]

# --- Variable selection ---
ybrca2.af<-as.factor(Patients_brca2.af$Phenotype)
xbrca2.af<-Patients_brca2.af[,c('AGE, EXTRACTION TIME','QUIMIO','RADIO','YEARS AFTER QUIMIO','YEARS AFTER RADIO','SMOKING HISTORY')]


rownames(Patients_brca2.af)<-gsub('-','.',rownames(Patients_brca2.af))

Total_T_CD4_ciber_brca2.af<-as.data.frame(Total_T_CD4_ciber[intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca2.af)),])
rownames(Total_T_CD4_ciber_brca2.af)<-intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca2.af))
colnames(Total_T_CD4_ciber_brca2.af)<-'Total_T_CD4_ciber'

x <- model.matrix(ybrca2.af~xbrca2.af$`AGE, EXTRACTION TIME`+xbrca2.af$QUIMIO+xbrca2.af$RADIO+xbrca2.af$`YEARS AFTER QUIMIO`+xbrca2.af$`YEARS AFTER RADIO`+xbrca2.af$`SMOKING HISTORY` +Total_T_CD4_ciber_brca2.af$Total_T_CD4_ciber -1)
x<-x[,-2]

# Fit the model
lasso_model <- cv.glmnet(x, ybrca2.af, family = "binomial", standardize=TRUE)

# Select the optimal lambda value based on cross-validation
optimal_lambda <- lasso_model$lambda.min

coef(lasso_model,s=optimal_lambda)
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                                                         s1
## (Intercept)                                  -9.381306e+00
## xbrca2.af$`AGE, EXTRACTION TIME`              8.572613e-02
## xbrca2.af$QUIMIOSI                            1.625722e+00
## xbrca2.af$RADIOSI                             2.608196e-15
## xbrca2.af$`YEARS AFTER QUIMIO`                .           
## xbrca2.af$`YEARS AFTER RADIO`                 .           
## xbrca2.af$`SMOKING HISTORY`SI                 3.076698e+00
## Total_T_CD4_ciber_brca2.af$Total_T_CD4_ciber  .

Finally, for this comparison, LASSO selects age, quimio, radio, and smoking.

Again, we should have the same model (same covariates) within the BRCA2 models to compare between them. That is, in the BRCA2 healthy comparison, we will add all the 3 variables LASSO returned, but in BRCA2 affected comparisons, we will also add CIBERSORT, although LASSO do not select it. For the affected samples comparison, we will also add chemotherapy and radiotherapy, although it is exclusively for affected patients, as healthy do not undergo cancer treatments. Despite this fact, we will consider that the models are identical, and we will compare between them.

11 Differential expression analysis

In this section, we will perform the differential expression analysis with LIMMA, and adding the covariates LASSO returned for each comparison. Those significant differential expressed genes will be those that whose adjusted p-value is less than 0.05, and its absolute logFC value is over 1.

11.1 BRCA1.SA vs NOMUT.SA

# --- Select the patients that have this condition ---
Patients_brca1_sa<-Patients_smoking[Patients_smoking$Phenotype=='BRCA1.SA'| Patients_smoking$Phenotype=='NOMUT.SA',]
brca_smk_sa1<-brca[,rownames(Patients_brca1_sa)]

#Check the order of the colnames of expression matrix and rownames of phenodata is the same
identical(colnames(brca_smk_sa1),rownames(Patients_brca1_sa))
## [1] TRUE
age<-Patients_brca1_sa$`AGE, EXTRACTION TIME`
smok<-Patients_brca1_sa$`SMOKING HISTORY`
cond<-as.factor(pData(brca_smk_sa1)$Phenotype) #Phenotype

#arrayweights
arrayw <- arrayWeights(brca_smk_sa1)
# barplot(arrayw, xlab="Array", ylab="Weight", col="white", las=2)
# abline(h=1, lwd=1, lty=2)


# Design a matrix
design <- model.matrix(~0+cond+age+smok)
rownames(design) <- sampleNames(brca_smk_sa1)
colnames(design) <- gsub("cond", "", colnames(design))

# Fit the model 
fit <- lmFit(brca_smk_sa1, design,weights = arrayw) 


# --- Contrasts ---
contrast.matrix <- makeContrasts(cond1=BRCA1.SA-NOMUT.SA,
                                 levels = design)

# --- Fit contrasts and Bayesian adjustment ---
fit2 <- contrasts.fit(fit, contrast.matrix)
fite <- eBayes(fit2)


# --- Extract results ---
# BRCA1.SA vs NOMUT.SA
top.table.brca1.sa <- topTable(fite, coef = 'cond1', number = Inf, adjust = "fdr")
head(top.table.brca1.sa)
##                logFC  AveExpr         t      P.Value adj.P.Val           B
## GPNMB     -0.8623314 3.882991 -5.912737 9.602999e-06 0.1052585  1.96824089
## LOC728554  0.6068391 8.708566  4.749191 1.296342e-04 0.4241059  0.33256269
## IFNGR2    -0.4413039 9.134212 -4.707440 1.426437e-04 0.4241059  0.26986132
## CLEC4D    -0.5879819 5.012083 -4.532115 2.133851e-04 0.4241059  0.00396882
## ITGAV     -0.8001331 8.919023 -4.437423 2.654141e-04 0.4241059 -0.14128143
## TMEM51    -0.5891220 5.409204 -4.387578 2.977641e-04 0.4241059 -0.21817184
# --- Distribution of p-values ---
hist(top.table.brca1.sa$P.Value, breaks = 100, main = "results P-value")

##BRCA1.AF vs NOMUT.AF

# --- Select the patients that have this condition ---
Patients_brca1_af<-Patients_smoking[Patients_smoking$Phenotype=='BRCA1.AF'| Patients_smoking$Phenotype=='NOMUT.AF',]
brca_smk_af1<-brca[,rownames(Patients_brca1_af)]
str(Patients_brca1_af)
## 'data.frame':    12 obs. of  21 variables:
##  $ NUM                    : num  47 44 7 15 31 32 34 37 52 46 ...
##  $ PATIENT INITIAL        : chr  "MPPE" "MCR" "NPV" "MCMF" ...
##  $ ARRAY CODE             : chr  "SG-25.CELL" "SG-27.CELL" "SG-43.CELL" "SG-59.CELL" ...
##  $ DATE OF EXTRACTION     : POSIXct, format: "2005-12-19" "2005-12-20" ...
##  $ DATE OF BIRTH          : POSIXct, format: "1958-05-02" "1960-09-05" ...
##  $ AGE, EXTRACTION TIME   : num  48 45 55 49 33 64 65 39 55 59 ...
##  $ CARRIER                : chr  "BRCA1" "BRCA1" "BRCA1" "BRCA1" ...
##  $ HEALTH CONDITION       : chr  "AFFECTED" "AFFECTED" "AFFECTED" "AFFECTED" ...
##  $ GROUP                  : chr  "NOR-BRCA1" "NOR-BRCA1" "NOR-BRCA1" "NOR-BRCA1" ...
##  $ FAM No                 : num  1 217 81 147 108 144 218 315 NA NA ...
##  $ MUTATION               : chr  "3478delTT" "185delAG" "330A>G" "3958del5 ins4" ...
##  $ EXON                   : num  11 2 5 11 11 11 6 NA NA NA ...
##  $ RNA CODE (NOR)         : num  25 27 43 59 93 95 97 107 115 117 ...
##  $ RNA CODE 2GY (RADIATED): num  26 28 44 60 94 96 98 108 116 118 ...
##  $ QUIMIO                 : Factor w/ 2 levels "NO","SI": 2 2 2 2 2 1 2 2 2 2 ...
##  $ YEARS AFTER QUIMIO     : num  24 5 14 7 2 0 11 2 4 12 ...
##  $ RADIO                  : Factor w/ 2 levels "NO","SI": 2 2 2 2 2 2 1 2 2 2 ...
##  $ YEARS AFTER RADIO      : num  25 4.5 14 7 2 18 0 2 3 12 ...
##  $ SMOKING HISTORY        : Factor w/ 2 levels "NO","SI": 1 2 2 2 1 1 1 2 2 2 ...
##  $ YEARS OF SMOKING       : num  NA NA NA 35 NA NA NA 25 NA NA ...
##  $ Phenotype              : chr  "BRCA1.AF" "BRCA1.AF" "BRCA1.AF" "BRCA1.AF" ...
#Check the order of the colnames of expression matrix and rownames of phenodata is the same
identical(colnames(brca_smk_af1),rownames(Patients_brca1_af))
## [1] TRUE
cond<-as.factor(pData(brca_smk_af1)$Phenotype) #Phenotype
age_brca1_af<- Patients_brca1_af$`AGE, EXTRACTION TIME` # Age
smoking_brca1_af<-Patients_brca1_af$`SMOKING HISTORY`
radio1af<-Patients_brca1_af$RADIO
quimio1af<-Patients_brca1_af$QUIMIO
yearsquimio1af<-Patients_brca1_af$`YEARS AFTER QUIMIO`
yearsradio1af<-Patients_brca1_af$`YEARS AFTER RADIO`

#arrayweights
arraywbrca1.af <- arrayWeights(brca_smk_af1)

# Design a matrix
design <- model.matrix(~0+cond+smoking_brca1_af+age_brca1_af)
rownames(design) <- sampleNames(brca_smk_af1)
colnames(design) <- gsub("cond", "", colnames(design))

# Fit the model 
fit <- lmFit(brca_smk_af1, design, weights = arraywbrca1.af) 

# --- Contrasts ---
contrast.matrix2 <- makeContrasts(cond1=BRCA1.AF-NOMUT.AF,
                                 levels = design)

# --- Fit contrasts and Bayesian adjustment ---
fit2 <- contrasts.fit(fit, contrast.matrix2)
fite <- eBayes(fit2)

# --- Extract results ---
# BRCA1.AF vs NOMUT.AF
top.table.brca1.af <- topTable(fite, coef = 'cond1', number = Inf, adjust = "fdr")
head(top.table.brca1.af)
##                logFC  AveExpr         t      P.Value adj.P.Val         B
## HSD17B7P2  1.5782976 4.142942  6.746314 1.258008e-05 0.1378903 -3.344205
## KIT       -1.0130546 4.963793 -5.024816 2.210245e-04 0.9999461 -3.573513
## NPIPB15    1.4678456 4.676935  4.555366 5.177700e-04 0.9999461 -3.659575
## SLC12A7    0.7814971 8.156584  4.066610 1.291134e-03 0.9999461 -3.762446
## LOC644172  0.9102819 5.200841  3.788669 2.193252e-03 0.9999461 -3.827340
## MGST2     -0.6422527 6.100279 -3.774768 2.252508e-03 0.9999461 -3.830710
# --- Distribution of p-values ---
hist(top.table.brca1.af$P.Value, breaks = 100, main = "results P-value")

11.2 BRCA2.SA vs NOMUT.SA

# --- Select the patients that have this condition ---
Patients_brca2_sa<-Patients_smoking[Patients_smoking$Phenotype=='BRCA2.SA'| Patients_smoking$Phenotype=='NOMUT.SA',]
brca_smk_sa2<-brca[,rownames(Patients_brca2_sa)]
str(Patients_brca2_sa)
## 'data.frame':    16 obs. of  21 variables:
##  $ NUM                    : num  48 43 9 12 13 19 20 22 21 23 ...
##  $ PATIENT INITIAL        : chr  "MPC" "MB" "EJC" "MSM" ...
##  $ ARRAY CODE             : chr  "SG-17.CELL" "SG-21.CELL" "SG-47.CELL" "SG-53.CELL" ...
##  $ DATE OF EXTRACTION     : POSIXct, format: "2005-07-04" "2005-09-12" ...
##  $ DATE OF BIRTH          : POSIXct, format: "1977-01-17" "1950-01-27" ...
##  $ AGE, EXTRACTION TIME   : num  28 55 29 45 36 42 31 33 29 51 ...
##  $ CARRIER                : chr  "BRCA2" "NUMUT" "BRCA2" "BRCA2" ...
##  $ HEALTH CONDITION       : chr  "HEALTHY" "HEALTHY" "HEALTHY" "HEALTHY" ...
##  $ GROUP                  : chr  "NOR-BRCA2" "NOR-NUMUT" "NOR-BRCA2" "NOR-BRCA2" ...
##  $ FAM No                 : num  NA NA 122 71 68 NA NA NA NA NA ...
##  $ MUTATION               : chr  "3492insT" NA "3034-3039del4bp" "3374delA" ...
##  $ EXON                   : num  11 NA 11 11 3 NA NA NA NA NA ...
##  $ RNA CODE (NOR)         : num  17 21 47 53 55 69 71 73 75 77 ...
##  $ RNA CODE 2GY (RADIATED): num  18 22 48 54 56 70 72 74 76 78 ...
##  $ QUIMIO                 : Factor w/ 2 levels "NO","SI": 1 1 1 1 1 1 1 1 1 1 ...
##  $ YEARS AFTER QUIMIO     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ RADIO                  : Factor w/ 2 levels "NO","SI": 1 1 1 1 1 1 1 1 1 1 ...
##  $ YEARS AFTER RADIO      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ SMOKING HISTORY        : Factor w/ 2 levels "NO","SI": 2 2 1 2 2 2 1 1 1 1 ...
##  $ YEARS OF SMOKING       : num  12 20 NA 20 15 20 NA NA NA NA ...
##  $ Phenotype              : chr  "BRCA2.SA" "NOMUT.SA" "BRCA2.SA" "BRCA2.SA" ...
#Check the order of the colnames of expression matrix and rownames of phenodata is the same
identical(colnames(brca_smk_sa2),rownames(Patients_brca2_sa))
## [1] TRUE
cond<-as.factor(pData(brca_smk_sa2)$Phenotype) #Phenotype
age_brca_sa2<- Patients_brca2_sa$`AGE, EXTRACTION TIME` # Age
smoking_brca_sa2<-Patients_brca2_sa$`SMOKING HISTORY`

# Prepare the cibersort variable
rownames(Patients_brca2_sa)<-gsub('-','.',rownames(Patients_brca2_sa))
commonsamples<-intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca2_sa))
cibersort<-Total_T_CD4_ciber[commonsamples,]


# Design a matrix
design <- model.matrix(~0+cond+age_brca_sa2+smoking_brca_sa2+cibersort)
rownames(design) <- sampleNames(brca_smk_sa2)
colnames(design) <- gsub("cond", "", colnames(design))


#arrayweights
arraywbrca2.sa <- arrayWeights(brca_smk_sa2)

# Fit the model 
fit <- lmFit(brca_smk_sa2, design, weights = arraywbrca2.sa) 

# --- Contrasts ---
contrast.matrix <- makeContrasts(cond1=BRCA2.SA-NOMUT.SA,
                                 levels = design)

# --- Fit contrasts and Bayesian adjustment ---
fit2 <- contrasts.fit(fit, contrast.matrix)
fite <- eBayes(fit2)

# --- Extract results ---
# BRCA2.SA vs NOMUT.SA
top.table.brca2.sa <- topTable(fite, coef = 'cond1', number = Inf, adjust = "fdr")
head(top.table.brca2.sa)
##                   logFC  AveExpr         t      P.Value  adj.P.Val         B
## LOC100653061 -1.5422112 6.006135 -7.544113 9.849565e-07 0.01079611 -2.357756
## GUSBP3       -1.5682961 5.975222 -5.098300 9.846041e-05 0.53961230 -2.927540
## GTF2H2B      -1.7625704 6.698422 -4.628401 2.599776e-04 0.69610037 -3.084048
## MICU3        -0.8295324 3.501167 -4.498714 3.411571e-04 0.69610037 -3.130394
## SMA4         -1.2277033 6.513050 -4.443402 3.832465e-04 0.69610037 -3.150581
## C11orf96     -1.6813445 5.734789 -4.394789 4.245892e-04 0.69610037 -3.168530
# --- Distribution of p-values ---
hist(top.table.brca2.sa$P.Value, breaks = 100, main = "results P-value")

In this comparison, there is one gene that is significantly differentially expressed between BRCA2 healthy and NOMUT healthy, LOC100653061, as the adjusted p-value is less than 0.05, and the absolute value of LogFC is over 1.

11.3 BRCA2.AF vs NOMUT.AF

# --- Select the patients that have this condition ---
Patients_brca2.af<-Patients_smoking[Patients_smoking$Phenotype=='BRCA2.AF'| Patients_smoking$Phenotype=='NOMUT.AF',]
brca_smk_brca2.af<-brca[,rownames(Patients_brca2.af)]

# Create a vector of conditions 
cond<-as.factor(pData(brca_smk_brca2.af)$Phenotype) #Phenotype
age_brca2.af<- Patients_brca2.af$`AGE, EXTRACTION TIME` # Age
smoking_brca2.af<-Patients_brca2.af$`SMOKING HISTORY`
radio_brca2.af<-Patients_brca2.af$RADIO
quimio_brca2.af<-Patients_brca2.af$QUIMIO

table(radio_brca2.af)
## radio_brca2.af
## NO SI 
##  1 13
yearsquimio<-Patients_brca2.af$`YEARS AFTER QUIMIO`
yearsradio<- Patients_brca2.af$`YEARS AFTER RADIO`

rownames(Patients_brca2.af)<-gsub('-','.',rownames(Patients_brca2.af))
commonsamples<-intersect(rownames(Total_T_CD4_ciber),rownames(Patients_brca2.af))
cibersort<-Total_T_CD4_ciber[commonsamples,]

#arrayweights
arraywbrca2.af <- arrayWeights(brca_smk_brca2.af)

# Design a matrix
design <- model.matrix(~0+cond+smoking_brca2.af+quimio_brca2.af+age_brca2.af+cibersort)
rownames(design) <- sampleNames(brca_smk_brca2.af)
colnames(design) <- gsub("cond", "", colnames(design))

# Fit the model 
fit <- lmFit(brca_smk_brca2.af, design, weights = arraywbrca2.af) 

# --- Contrasts ---
contrast.matrix <- makeContrasts(cond1=BRCA2.AF-NOMUT.AF,levels = design)

# --- Fit contrasts and Bayesian adjustment ---
fit2 <- contrasts.fit(fit, contrast.matrix)
fite <- eBayes(fit2)

# --- Extract results ---
# BRCA2.AF vs NOMUT.AF
top.table.brca2.AF <- topTable(fite, coef = 'cond1', number = Inf, adjust = "fdr")
head(top.table.brca2.AF)
##                   logFC  AveExpr         t      P.Value adj.P.Val         B
## ATP6V0A4      1.5554843 3.153319  5.312120 0.0001102301 0.6364187 -3.276049
## LOC153577    -0.9480805 4.234649 -5.156106 0.0001464789 0.6364187 -3.308485
## LOC100507103  1.2522676 3.927654  5.061890 0.0001741863 0.6364187 -3.328787
## TWIST1        2.3140685 4.503369  4.880042 0.0002441350 0.6689910 -3.369541
## GUSBP3       -1.2187979 5.754986 -4.672796 0.0003604857 0.6800164 -3.418594
## A2M-AS1      -2.6835502 3.910486 -4.592727 0.0004196347 0.6800164 -3.438311
# --- Distribution of p-values ---
hist(top.table.brca2.AF$P.Value, breaks = 100, main = "results P-value")

11.4 Rank gene list

For the next step, the Gene Set Enrichment Analysis, we need a list with the whole genes of genes ordered. In this case, we will use the combination of the sign of the logFC and the log10 of the P.value to create a metric to rank all the genes.

# --- BRCA1.SA vs NOMUT.SA ----
sign=sign(top.table.brca1.sa$logFC)
logP=-log10(top.table.brca1.sa$P.Value)
metric=logP*sign
geneList = metric
names(geneList) = rownames(top.table.brca1.sa)
geneList_brca1.sa = sort(geneList, decreasing = TRUE)

gene_names_list_brca1.sa <- names(geneList_brca1.sa)

# --- BRCA1.AF vs NOMUT.AF ----
signbrca1af=sign(top.table.brca1.af$logFC)
logPbrca1af=-log10(top.table.brca1.af$P.Value)
metricbrca1.af=logPbrca1af*signbrca1af
geneList_brca1_af = metricbrca1.af
names(geneList_brca1_af) = rownames(top.table.brca1.af)
geneList_brca1_af = sort(geneList_brca1_af, decreasing = TRUE)

gene_names_list_brca1.af <- names(geneList_brca1_af)

# --- BRCA2.SA vs NOMUT.SA ---
signbrca2=sign(top.table.brca2.sa$logFC)
logP_brca2=-log10(top.table.brca2.sa$P.Value)
metricbrca2=logP_brca2*signbrca2
geneList_brca2_sa = metricbrca2
names(geneList_brca2_sa) = rownames(top.table.brca2.sa)
geneList_brca2_sa = sort(geneList_brca2_sa, decreasing = TRUE)


gene_names_list_brca2.sa <- names(geneList_brca2_sa)

# --- BRCA2.AF vs NOMUT.AF ---
signbrca2af=sign(top.table.brca2.AF$logFC)
logP_brca2af=-log10(top.table.brca2.AF$P.Value)
metricbrca2af=logP_brca2af*signbrca2af
geneList_brca2_af = metricbrca2af
names(geneList_brca2_af) = rownames(top.table.brca2.AF)
geneList_brca2_af = sort(geneList_brca2_af, decreasing = TRUE)

gene_names_list_brca2.af <-  names(geneList_brca2_af)

12 Functional analysis

The steps to perform gene set enrichment analysis comparing different conditions are described next.

  • First, data has to be normalized and preprocessed, which is done and explained in the previous steps.

  • Define curated gene sets or pathways. There are gene set databases like MSigDB (Molecular Signatures Database) or Reactome, that provide pre-defined gene sets representing various biological processes and pathways.

  • Perform differential gene expression analysis comparing the different conditions. The aim of this step is to obtain a complete list of genes (ALL genes) ranked by an statistical parameter. In this case, to rank the genes of the sample data results, we will combine the sign of the logFC and the p-value.

  • Conduct GSEA analysis. clusterProfiler R package enables to perform GSEA analysis.

We will use MSigDB to get the gene set collections. Within this database, we can access to Hallmark or to curated gene sets, which include canonical pathways gene sets derived from the Reactome database. The GSEA is perform using the Hallmark gene set collection and the Reactome. Results are finally compared.

12.1 GSEA - mSigDB -Hallmark

We will use data from the hallmark gene set collection. Hallmark gene sets are coherently expressed signatures derived by aggregating many MSigDB gene sets to represent well-defined biological states or processes. In this collection, there are 50 gene sets.

library(enrichplot)
## 
## 
## Attaching package: 'enrichplot'
## The following object is masked from 'package:ggpubr':
## 
##     color_palette
library(msigdbr)

h_gene_sets = msigdbr(species = "Homo sapiens", category = "H")
H.entrez <- h_gene_sets %>% dplyr::select(gs_name, entrez_gene)
H.symbol <- h_gene_sets %>% dplyr::select(gs_name, gene_symbol)

12.1.1 BRCA1.SA vs NOMUT.SA

library(ggplot2)
library(clusterProfiler)
set.seed(123)
resGsea <- GSEA(geneList_brca1.sa, TERM2GENE = H.symbol)
head(resGsea)
##                                                            ID
## HALLMARK_MYC_TARGETS_V1               HALLMARK_MYC_TARGETS_V1
## HALLMARK_E2F_TARGETS                     HALLMARK_E2F_TARGETS
## HALLMARK_G2M_CHECKPOINT               HALLMARK_G2M_CHECKPOINT
## HALLMARK_INFLAMMATORY_RESPONSE HALLMARK_INFLAMMATORY_RESPONSE
## HALLMARK_COMPLEMENT                       HALLMARK_COMPLEMENT
## HALLMARK_HYPOXIA                             HALLMARK_HYPOXIA
##                                                   Description setSize
## HALLMARK_MYC_TARGETS_V1               HALLMARK_MYC_TARGETS_V1     133
## HALLMARK_E2F_TARGETS                     HALLMARK_E2F_TARGETS     161
## HALLMARK_G2M_CHECKPOINT               HALLMARK_G2M_CHECKPOINT     139
## HALLMARK_INFLAMMATORY_RESPONSE HALLMARK_INFLAMMATORY_RESPONSE     163
## HALLMARK_COMPLEMENT                       HALLMARK_COMPLEMENT     144
## HALLMARK_HYPOXIA                             HALLMARK_HYPOXIA     142
##                                enrichmentScore       NES pvalue     p.adjust
## HALLMARK_MYC_TARGETS_V1              0.6813576  2.874468  1e-10 7.142857e-10
## HALLMARK_E2F_TARGETS                 0.6420095  2.772205  1e-10 7.142857e-10
## HALLMARK_G2M_CHECKPOINT              0.5551864  2.369370  1e-10 7.142857e-10
## HALLMARK_INFLAMMATORY_RESPONSE      -0.5605903 -2.280255  1e-10 7.142857e-10
## HALLMARK_COMPLEMENT                 -0.5747562 -2.278810  1e-10 7.142857e-10
## HALLMARK_HYPOXIA                    -0.5651075 -2.235827  1e-10 7.142857e-10
##                                      qvalue rank                   leading_edge
## HALLMARK_MYC_TARGETS_V1        2.857143e-10 2177 tags=65%, list=20%, signal=53%
## HALLMARK_E2F_TARGETS           2.857143e-10 3024 tags=72%, list=28%, signal=53%
## HALLMARK_G2M_CHECKPOINT        2.857143e-10 2918 tags=63%, list=27%, signal=47%
## HALLMARK_INFLAMMATORY_RESPONSE 2.857143e-10 1829 tags=40%, list=17%, signal=34%
## HALLMARK_COMPLEMENT            2.857143e-10 1862 tags=38%, list=17%, signal=32%
## HALLMARK_HYPOXIA               2.857143e-10 2038 tags=44%, list=19%, signal=37%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                core_enrichment
## HALLMARK_MYC_TARGETS_V1                                                                                                                                                                                 HSPD1/CCT4/EIF2S1/PSMD14/RFC4/EIF2S2/CSTF2/NOLC1/RRM1/PSMA1/NOP16/TRA2B/CCT2/CTPS1/TXNL4A/HSP90AB1/NPM1/VDAC3/SNRPD1/GNL3/NCBP2/PWP1/SERBP1/HDDC2/ORC2/CDK4/PSMA2/SNRPD3/AIMP2/APEX1/CDC20/ETF1/ILF2/CCT5/HSPE1/TCP1/ABCE1/COPS5/XPOT/DDX18/EIF3J/SNRPG/RRP9/PHB2/RANBP1/GOT2/NDUFAB1/POLE3/NME1/EIF3B/MRPL9/RPL14/PHB/SNRPA1/DDX21/ACP1/SYNCRIP/SNRPD2/ODC1/SRSF1/RAN/MCM6/MCM7/TUFM/RUVBL2/COX5A/SSBP1/PSMD1/EXOSC7/MAD2L1/PSMC6/DUT/CCT3/PCNA/TYMS/MYC/IFRD1/CDC45/USP1/KPNA2/MCM4/PSMA7/PSMC4/CCNA2/LSM7/MCM2/CCT7
## HALLMARK_E2F_TARGETS           RAD51C/CENPE/EIF2S1/NOLC1/MSH2/STMN1/TRA2B/CTPS1/NUP107/TIPIN/POLD3/UBE2T/CSE1L/PLK4/EXOSC8/POP7/DONSON/SMC4/GINS3/PSMC3IP/LYAR/ORC2/CDK4/CDC20/DCTPP1/HELLS/BRMS1L/DDX39A/RFC3/RAD1/CDKN3/TRIP13/RANBP1/LIG1/NME1/ANP32E/CDCA3/CHEK1/TCF19/CBX5/KIF2C/UBR7/SNRPB/DSCC1/SYNCRIP/POLA2/KIF22/CKS2/PRPS1/SRSF1/RAN/AURKA/SMC6/TFRC/BIRC5/DLGAP5/MCM6/E2F8/MCM3/MCM7/PRIM2/BUB1B/EED/TK1/CKS1B/MTHFD2/RNASEH2A/MAD2L1/DUT/SPAG5/SPC25/CCNE1/PCNA/BARD1/MYC/NUDT21/CDK1/PNN/WEE1/KIF18B/USP1/KPNA2/CDC25A/MCM4/AK2/DNMT1/CCP110/ATAD2/NASP/MCM2/DCLRE1B/HMMR/GINS1/PTTG1/MKI67/ORC6/AURKB/GINS4/ASF1B/RRM2/PHF5A/UNG/RFC2/NOP56/CENPM/CDCA8/RPA3/SMC3/MCM5/RACGAP1/LMNB1/MELK/TMPO/ZW10/CIT/SUV39H1
## HALLMARK_G2M_CHECKPOINT                                                                                                                                                                                         NCL/CENPE/NOLC1/STMN1/TRA2B/DKC1/HIRA/SNRPD1/PLK4/ORC5/SMC2/CDC27/SLC7A1/FBXO5/SMC4/NDC80/CDK4/CDC20/SRSF10/MNAT1/DDX39A/BUB1/TROAP/TTK/CDKN3/EXO1/CENPF/CHEK1/KIF2C/DBF4/CBX1/STIL/SYNCRIP/POLA2/KIF22/CKS2/ODC1/SRSF1/PRMT5/AURKA/RAD54L/RBL1/BIRC5/MCM6/CDC6/MCM3/E2F2/PRIM2/POLQ/CKS1B/UBE2C/CDC7/MAD2L1/BARD1/MYC/CDK1/CDC45/KPNA2/CDC25A/PBK/NASP/UCK2/CCNA2/MCM2/HMMR/KIF15/PTTG1/SS18/MKI67/NUSAP1/CHAF1A/ORC6/AURKB/E2F1/KIF23/PRC1/SMARCC1/SQLE/SLC12A2/MCM5/RACGAP1/WRN/KIF11/GINS2/LMNB1/TPX2/TMPO
## HALLMARK_INFLAMMATORY_RESPONSE                                                                                                                                                                                                                                                                                                     IRAK2/TAPBP/CCL22/CD55/NMUR1/CLEC5A/TNFSF15/SLC11A2/SGMS2/KCNJ2/P2RY2/NLRP3/SLC31A2/INHBA/IL4R/ITGB8/SPHK1/RIPK2/RHOG/MET/IL1B/PTGIR/KLF6/MXD1/EBI3/PDPN/IL10RA/BEST1/PIK3R5/TLR1/STAB1/MMP14/AHR/CXCL6/CCL20/CD82/LYN/HBEGF/C5AR1/HRH1/FPR1/CDKN1A/PTPRE/SCARF1/ABCA1/CD14/MARCO/ICAM1/ITGA5/OLR1/NAMPT/CSF3R/TIMP1/AQP9/EREG/TNFRSF1B/PTAFR/PLAUR/ADM/RNF144B/GNA15/OSMR/NOD2/TLR2/IFNGR2
## HALLMARK_COMPLEMENT                                                                                                                                                                                                                                                                                                                                                                                                 CSRP1/CBLB/CD55/LGALS3/SERPING1/CFB/CTSL/PRSS36/ANG/CXCL1/LAMP2/PDP1/RHOG/FCER1G/CTSD/CASP4/SRC/DOCK4/MAFF/CTSV/EHD1/CASP1/LTF/PIK3R5/CPQ/SERPINA1/DUSP6/MMP14/CTSO/KYNU/LYN/CPM/IRF2/ANXA5/CTSS/GNAI2/C3/ADAM9/GCA/OLR1/GNB4/ITGAM/MT3/S100A12/S100A9/CDA/PRKCD/TIMP2/CEBPB/TIMP1/CTSB/PLAUR/CD59/PLA2G4A
## HALLMARK_HYPOXIA                                                                                                                                                                                                                                                                                                                              STBD1/GAA/DPYSL4/P4HA1/CHST2/SDC4/CCNG2/SDC2/ANXA2/STC2/ATP7A/GLRX/PNRC1/AMPD3/CITED2/KLHL24/S100A4/SLC2A3/CDKN1C/TPI1/LXN/FOXO3/PLIN2/BTG1/VLDLR/CAV1/HAS1/ZFP36/PLAC8/SLC6A6/FBP1/GCNT2/HS3ST1/STC1/HMOX1/MAFF/KLF6/SLC2A5/BNIP3L/P4HA2/FOSL2/PFKFB3/GYS1/PAM/ERO1A/NEDD4L/ACKR3/IER3/PPP1R15A/CDKN1A/TGM2/ANGPTL4/IRS2/HK2/TGFBI/TMEM45A/MT2A/NDRG1/PLAUR/ADM/VEGFA/MT1E/CA12
#plots
resGseaR<-resGsea@result[resGsea@result$p.adjust<0.05,]

# Barplot
barplot=ggplot(resGseaR, aes(x=ID, y=NES,fill=p.adjust)) + 
  geom_bar(stat = "identity") +
  scale_fill_continuous(low='red', high='blue')+
  coord_flip() +
  theme_bw()

barplot

The gene sets with a positive NES value, indicate that there is an enrichment at the top of the ranked gene list. On the other hand, for those gene sets whose NES value is negative, that means they are enriched at the bottom of the ranked gene list.

Most of the positive gene sets are related with cell proliferation pathways.

List of negative gene sets:

# --- Negative NES Pathways ---
negative_pathways_brca1_sa<-resGseaR[resGseaR$NES<0,]$ID
negative_pathways_brca1_sa
##  [1] "HALLMARK_INFLAMMATORY_RESPONSE"            
##  [2] "HALLMARK_COMPLEMENT"                       
##  [3] "HALLMARK_HYPOXIA"                          
##  [4] "HALLMARK_TNFA_SIGNALING_VIA_NFKB"          
##  [5] "HALLMARK_IL6_JAK_STAT3_SIGNALING"          
##  [6] "HALLMARK_ANGIOGENESIS"                     
##  [7] "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION"
##  [8] "HALLMARK_TGF_BETA_SIGNALING"               
##  [9] "HALLMARK_KRAS_SIGNALING_UP"                
## [10] "HALLMARK_COAGULATION"                      
## [11] "HALLMARK_APICAL_JUNCTION"                  
## [12] "HALLMARK_P53_PATHWAY"                      
## [13] "HALLMARK_ANDROGEN_RESPONSE"                
## [14] "HALLMARK_XENOBIOTIC_METABOLISM"            
## [15] "HALLMARK_IL2_STAT5_SIGNALING"              
## [16] "HALLMARK_APOPTOSIS"                        
## [17] "HALLMARK_INTERFERON_GAMMA_RESPONSE"        
## [18] "HALLMARK_APICAL_SURFACE"

List of positive gene sets:

# --- Positive NES Pathways ---
positive_pathways_brca1_sa<-resGseaR[resGseaR$NES>0,]$ID
positive_pathways_brca1_sa
## [1] "HALLMARK_MYC_TARGETS_V1"            "HALLMARK_E2F_TARGETS"              
## [3] "HALLMARK_G2M_CHECKPOINT"            "HALLMARK_MYC_TARGETS_V2"           
## [5] "HALLMARK_OXIDATIVE_PHOSPHORYLATION" "HALLMARK_SPERMATOGENESIS"          
## [7] "HALLMARK_DNA_REPAIR"                "HALLMARK_MTORC1_SIGNALING"         
## [9] "HALLMARK_UNFOLDED_PROTEIN_RESPONSE"

12.1.2 BRCA1.AF vs NOMUT.AF

set.seed(123)
resGsea.brca1.af <- GSEA(geneList_brca1_af, TERM2GENE = H.symbol)
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
head(resGsea.brca1.af)
##                                                                                    ID
## HALLMARK_HYPOXIA                                                     HALLMARK_HYPOXIA
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION
## HALLMARK_MYC_TARGETS_V2                                       HALLMARK_MYC_TARGETS_V2
## HALLMARK_KRAS_SIGNALING_UP                                 HALLMARK_KRAS_SIGNALING_UP
## HALLMARK_INTERFERON_ALPHA_RESPONSE                 HALLMARK_INTERFERON_ALPHA_RESPONSE
## HALLMARK_ANGIOGENESIS                                           HALLMARK_ANGIOGENESIS
##                                                                           Description
## HALLMARK_HYPOXIA                                                     HALLMARK_HYPOXIA
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION
## HALLMARK_MYC_TARGETS_V2                                       HALLMARK_MYC_TARGETS_V2
## HALLMARK_KRAS_SIGNALING_UP                                 HALLMARK_KRAS_SIGNALING_UP
## HALLMARK_INTERFERON_ALPHA_RESPONSE                 HALLMARK_INTERFERON_ALPHA_RESPONSE
## HALLMARK_ANGIOGENESIS                                           HALLMARK_ANGIOGENESIS
##                                            setSize enrichmentScore       NES
## HALLMARK_HYPOXIA                               142      -0.5243976 -2.207029
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION     114      -0.5269156 -2.159864
## HALLMARK_MYC_TARGETS_V2                         52       0.5891892  1.991681
## HALLMARK_KRAS_SIGNALING_UP                     127      -0.4190948 -1.736538
## HALLMARK_INTERFERON_ALPHA_RESPONSE              80       0.4966624  1.803902
## HALLMARK_ANGIOGENESIS                           26      -0.6637337 -2.035624
##                                                  pvalue     p.adjust
## HALLMARK_HYPOXIA                           4.149985e-10 2.074993e-08
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 2.121442e-08 5.303604e-07
## HALLMARK_MYC_TARGETS_V2                    6.650715e-05 1.108453e-03
## HALLMARK_KRAS_SIGNALING_UP                 1.072570e-04 1.340713e-03
## HALLMARK_INTERFERON_ALPHA_RESPONSE         2.160582e-04 2.160582e-03
## HALLMARK_ANGIOGENESIS                      2.929306e-04 2.441089e-03
##                                                  qvalue rank
## HALLMARK_HYPOXIA                           1.572626e-08 1821
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION 4.019573e-07 1823
## HALLMARK_MYC_TARGETS_V2                    8.400904e-04 3766
## HALLMARK_KRAS_SIGNALING_UP                 1.016119e-03 2033
## HALLMARK_INTERFERON_ALPHA_RESPONSE         1.637494e-03 2435
## HALLMARK_ANGIOGENESIS                      1.850088e-03 1116
##                                                              leading_edge
## HALLMARK_HYPOXIA                           tags=38%, list=17%, signal=32%
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION tags=37%, list=17%, signal=31%
## HALLMARK_MYC_TARGETS_V2                    tags=69%, list=34%, signal=46%
## HALLMARK_KRAS_SIGNALING_UP                 tags=40%, list=19%, signal=33%
## HALLMARK_INTERFERON_ALPHA_RESPONSE         tags=44%, list=22%, signal=34%
## HALLMARK_ANGIOGENESIS                      tags=54%, list=10%, signal=48%
##                                                                                                                                                                                                                                                                                                                                                               core_enrichment
## HALLMARK_HYPOXIA                                            PNRC1/MAFF/FOXO3/MT2A/AMPD3/JUN/FOS/ERO1A/GPI/PLAUR/P4HA2/CAV1/RORA/GBE1/IER3/P4HA1/MAP3K1/BNIP3L/TPBG/GAA/CCNG2/STC1/ADM/PDK1/PKP1/NR3C1/NDRG1/GRHPR/IRS2/CA12/F3/SLC2A3/CDKN1B/VEGFA/TIPARP/PGK1/VLDLR/KLHL24/IGFBP3/GPC3/PAM/IL6/PFKFB3/ANGPTL4/ANXA2/PLIN2/AKAP12/STBD1/TMEM45A/EFNA1/CXCR4/SDC2/ACKR3/S100A4
## HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION                                                                           COL16A1/GAS1/FERMT2/JUN/GJA1/CXCL1/PRSS2/COL1A1/PLAUR/ITGB1/NT5E/LOXL1/CD44/NOTCH2/COL6A3/PMP22/ANPEP/MMP2/TIMP1/HTRA1/CADM1/VEGFA/ITGB3/THBS1/FGF2/COPA/MMP3/IGFBP3/LGALS1/ITGAV/IL6/APLP1/WNT5A/IGFBP2/CXCL8/VCAN/PLOD2/FBLN5/ADAM12/GLIPR1/ACTA2/AREG
## HALLMARK_MYC_TARGETS_V2                                                                                                                                      PPRC1/DUSP2/MPHOSPH10/NOP2/LAS1L/TCOF1/WDR43/RABEPK/NDUFAF4/MRTO4/SUPV3L1/GRWD1/TBRG4/IMP4/GNL3/PPAN/MYBBP1A/MCM5/SRM/SORD/NOC4L/BYSL/PLK4/NIP7/FARSA/PRMT3/CDK4/PA2G4/DDX18/PUS1/WDR74/RRP9/RCL1/NOP56/MYC/IPO4
## HALLMARK_KRAS_SIGNALING_UP                 PCSK1N/MAFB/SCG5/ETS1/TNFRSF1B/ITGB2/LAPTM5/FGF9/F13A1/TSPAN7/RABGAP1L/ABCB1/ERO1A/ADAMDEC1/GADD45G/LY96/PECAM1/GNG11/GPNMB/ST6GAL1/PLAUR/FCER1G/SERPINA3/NRP1/SCN1B/MMD/MAP3K1/TLR8/ALDH1A2/PLEK2/RETN/EREG/F2RL1/EPB41L3/APOD/ADAM17/NR0B2/CCL20/IGFBP3/IL1B/CSF2RA/ANGPTL4/SPON1/PTGS2/PTBP2/AKAP12/TRIB2/IGF2/CXCR4/IKZF1/G0S2
## HALLMARK_INTERFERON_ALPHA_RESPONSE                                                                                                                          CXCL10/LY6E/UBA7/HELZ2/MVB12A/TRIM26/MOV10/IRF2/IFIH1/DHX58/EIF2AK2/LAP3/RSAD2/SP110/CMTR1/TRIM21/PNPT1/PARP12/IFI27/TRAFD1/CXCL11/OAS1/ISG15/IL4R/BATF2/TAP1/USP18/IRF7/HERC6/LGALS3BP/IFI44/CD74/GMPR/IL7/TDRD7
## HALLMARK_ANGIOGENESIS                                                                                                                                                                                                                                                                        APP/NRP1/STC1/TIMP1/OLR1/VTN/TNFRSF21/VEGFA/PRG2/ITGAV/THBD/VCAN/SERPINA5/S100A4
#plots
resGseaR.brca1.af<-resGsea.brca1.af@result[resGsea.brca1.af@result$p.adjust<0.05,]


p=ggplot(resGseaR.brca1.af, aes(x=ID, y=NES,fill=p.adjust)) + 
  geom_bar(stat = "identity") +
  scale_fill_continuous(low='red', high='blue')+
  coord_flip() +
  theme_bw()

p

# --- Negative NES Pathways ---
negative_pathways_brca1_af<-resGsea.brca1.af[resGsea.brca1.af$NES<0,]$ID
negative_pathways_brca1_af
## [1] "HALLMARK_HYPOXIA"                          
## [2] "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION"
## [3] "HALLMARK_KRAS_SIGNALING_UP"                
## [4] "HALLMARK_ANGIOGENESIS"                     
## [5] "HALLMARK_COAGULATION"                      
## [6] "HALLMARK_UV_RESPONSE_DN"                   
## [7] "HALLMARK_INFLAMMATORY_RESPONSE"
# --- Positive NES Pathways ---
positive_pathways_brca1_af<-resGsea.brca1.af[resGsea.brca1.af$NES>0,]$ID
positive_pathways_brca1_af
## [1] "HALLMARK_MYC_TARGETS_V2"            "HALLMARK_INTERFERON_ALPHA_RESPONSE"
## [3] "HALLMARK_MYC_TARGETS_V1"            "HALLMARK_E2F_TARGETS"              
## [5] "HALLMARK_INTERFERON_GAMMA_RESPONSE"

12.1.3 BRCA2.SA vs NOMUT.SA

set.seed(123)
resGsea_brca2.sa <- GSEA(geneList_brca2_sa, TERM2GENE = H.symbol)
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
head(resGsea_brca2.sa)
##                                                                ID
## HALLMARK_TNFA_SIGNALING_VIA_NFKB HALLMARK_TNFA_SIGNALING_VIA_NFKB
## HALLMARK_MYC_TARGETS_V1                   HALLMARK_MYC_TARGETS_V1
## HALLMARK_E2F_TARGETS                         HALLMARK_E2F_TARGETS
## HALLMARK_INFLAMMATORY_RESPONSE     HALLMARK_INFLAMMATORY_RESPONSE
## HALLMARK_HYPOXIA                                 HALLMARK_HYPOXIA
## HALLMARK_G2M_CHECKPOINT                   HALLMARK_G2M_CHECKPOINT
##                                                       Description setSize
## HALLMARK_TNFA_SIGNALING_VIA_NFKB HALLMARK_TNFA_SIGNALING_VIA_NFKB     179
## HALLMARK_MYC_TARGETS_V1                   HALLMARK_MYC_TARGETS_V1     133
## HALLMARK_E2F_TARGETS                         HALLMARK_E2F_TARGETS     161
## HALLMARK_INFLAMMATORY_RESPONSE     HALLMARK_INFLAMMATORY_RESPONSE     163
## HALLMARK_HYPOXIA                                 HALLMARK_HYPOXIA     142
## HALLMARK_G2M_CHECKPOINT                   HALLMARK_G2M_CHECKPOINT     139
##                                  enrichmentScore       NES       pvalue
## HALLMARK_TNFA_SIGNALING_VIA_NFKB      -0.5390011 -2.371270 1.000000e-10
## HALLMARK_MYC_TARGETS_V1                0.5721128  2.324776 1.000000e-10
## HALLMARK_E2F_TARGETS                   0.5489981  2.298289 1.000000e-10
## HALLMARK_INFLAMMATORY_RESPONSE        -0.5035802 -2.192776 2.371113e-10
## HALLMARK_HYPOXIA                      -0.5246904 -2.238497 4.215725e-10
## HALLMARK_G2M_CHECKPOINT                0.4747939  1.939488 7.242546e-07
##                                      p.adjust       qvalue rank
## HALLMARK_TNFA_SIGNALING_VIA_NFKB 1.666667e-09 1.157895e-09 2083
## HALLMARK_MYC_TARGETS_V1          1.666667e-09 1.157895e-09 2912
## HALLMARK_E2F_TARGETS             1.666667e-09 1.157895e-09 2414
## HALLMARK_INFLAMMATORY_RESPONSE   2.963892e-09 2.059125e-09 1912
## HALLMARK_HYPOXIA                 4.215725e-09 2.928820e-09 1541
## HALLMARK_G2M_CHECKPOINT          6.035455e-06 4.193053e-06 2303
##                                                    leading_edge
## HALLMARK_TNFA_SIGNALING_VIA_NFKB tags=43%, list=19%, signal=35%
## HALLMARK_MYC_TARGETS_V1          tags=61%, list=27%, signal=45%
## HALLMARK_E2F_TARGETS             tags=53%, list=22%, signal=42%
## HALLMARK_INFLAMMATORY_RESPONSE   tags=34%, list=17%, signal=28%
## HALLMARK_HYPOXIA                 tags=36%, list=14%, signal=31%
## HALLMARK_G2M_CHECKPOINT          tags=43%, list=21%, signal=35%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               core_enrichment
## HALLMARK_TNFA_SIGNALING_VIA_NFKB                                          NR4A1/CCRL2/TRIB1/CCNL1/AREG/PDE4B/TNFSF9/BHLHE40/FOSB/ATP2B1/NAMPT/NR4A3/LITAF/NFKB2/CCL5/CD80/BCL3/ZC3H12A/SOCS3/KYNU/PNRC1/MARCKS/ETS2/CFLAR/SLC2A3/LAMB3/NFIL3/IER3/NFAT5/SLC2A6/STAT5A/BCL2A1/JUNB/CEBPB/NFKBIE/CCND1/GPR183/ABCA1/SPHK1/ZFP36/B4GALT5/TRIP10/PER1/RNF19B/TLR2/SGK1/PPP1R15A/F3/SAT1/IER5/NR4A2/TRAF1/GADD45A/BCL6/MAP3K8/MSC/KLF6/MAFF/DUSP5/FOSL2/RELA/CDKN1A/TNFAIP2/ZBTB10/TNFRSF9/TNFAIP3/BTG1/ACKR3/VEGFA/TNIP1/IFNGR2/DENND5A/KDM6B/REL/ID2/ICAM1/RIPK2
## HALLMARK_MYC_TARGETS_V1                                                      RANBP1/MAD2L1/CCT5/DUT/MCM4/RRM1/CCT4/CDK4/APEX1/NOLC1/PHB/SERBP1/PSMB3/PSMA1/MCM2/CCT7/CDC20/CCT2/SLC25A3/RFC4/ABCE1/VDAC3/NME1/HDAC2/MCM7/MRPL9/NCBP1/PSMA7/AIMP2/HSPE1/PCNA/USP1/NHP2/TCP1/NPM1/CTPS1/CSTF2/ILF2/MYC/CYC1/GOT2/NOP16/PRPS2/PHB2/EIF3J/TFDP1/LSM2/EIF2S1/SNRPD1/CCNA2/RRP9/SNRPG/NDUFAB1/EIF1AX/PSMD14/DDX18/CDC45/SF3B3/TYMS/HSPD1/NOP56/MRPS18B/POLE3/SRM/PSMD1/SYNCRIP/PSMC6/FBL/SSB/MRPL23/RUVBL2/HDDC2/PSMA2/CCT3/COX5A/ETF1/VBP1/TUFM/PSMC4/EXOSC7/C1QBP
## HALLMARK_E2F_TARGETS             RFC3/RANBP1/MAD2L1/CIT/TBRG4/POP7/MYBL2/CSE1L/KIF18B/DUT/AURKB/UBE2T/PLK4/MCM4/TIPIN/CDK4/CCNE1/SUV39H1/NOLC1/ASF1B/NUP107/SPC24/PRPS1/MCM2/CDC20/BIRC5/DDX39A/RRM2/TK1/CDC25A/DLGAP5/LMNB1/SPC25/MELK/GINS3/POLD3/TRIP13/NME1/DONSON/LYAR/MKI67/MCM7/RACGAP1/PCNA/CKS1B/RAD51C/USP1/CDK1/STMN1/DCTPP1/SLBP/DNMT1/DCK/CHEK1/ASF1A/TUBG1/CTPS1/SNRPB/MSH2/MYC/CDKN3/TMPO/CDCA3/EIF2S1/NUP205/WEE1/BRCA1/TOP2A/RNASEH2A/PSMC3IP/ATAD2/RPA3/MCM3/KIF2C/NCAPD2/NUDT21/NOP56/PRIM2/ANP32E/RAD51AP1/KIF4A/POLA2/CENPM/SYNCRIP/RAD1
## HALLMARK_INFLAMMATORY_RESPONSE                                                                                                                                                                                                PDE4B/TNFSF9/IL10/SCARF1/ATP2B1/MEFV/NAMPT/CD55/CD48/ITGB8/CCL5/TPBG/AQP9/RGS1/MARCO/IL4R/TNFRSF1B/GPR183/CCL22/ABCA1/NDP/SPHK1/ICAM4/SRI/PIK3R5/RHOG/CD70/TLR2/CXCR6/AHR/F3/GNA15/PTGIR/PTAFR/KLF6/NLRP3/RELA/CDKN1A/IL15/SCN1B/TNFRSF9/RNF144B/SLC1A2/RASGRP1/SLC11A2/HRH1/IFNGR2/KCNJ2/ICAM1/OSMR/IL10RA/EREG/RIPK2/NOD2/ADM
## HALLMARK_HYPOXIA                                                                                                                                                                                                                                    TPBG/GCNT2/ANKZF1/ENO1/PAM/PNRC1/MT1E/CDKN1C/SLC2A3/P4HA1/PPFIA4/CXCR4/NFIL3/IER3/ENO2/UGP2/RBPJ/PLIN2/ERRFI1/PDK1/ZFP36/TMEM45A/LXN/KLHL24/PPP1R15A/HSPA5/F3/HK2/ERO1A/BNIP3L/VLDLR/CCNG2/ZNF292/WSB1/KLF6/MAFF/FOSL2/CDKN1A/TNFAIP3/DDIT3/BTG1/ACKR3/VEGFA/NR3C1/GYS1/TGM2/TPST2/CA12/ANGPTL4/NDRG1/ADM
## HALLMARK_G2M_CHECKPOINT                                                                                                                                                                      HOXC10/MAD2L1/TROAP/UBE2C/MYBL2/BUB1/AURKB/GINS2/SLC7A1/PLK4/E2F2/CDK4/SUV39H1/NOLC1/NDC80/MCM2/CDC20/BIRC5/DDX39A/CHAF1A/CDC25A/LMNB1/RAD54L/DKC1/CENPA/SMC2/MKI67/SRSF10/TPX2/CBX1/RACGAP1/POLQ/KIF11/CKS1B/NUSAP1/CDK1/STMN1/CHEK1/MYC/CDKN3/TMPO/CDC6/EXO1/TFDP1/SNRPD1/CCNA2/INCENP/TOP2A/NEK2/STIL/E2F1/MCM3/KIF2C/PRMT5/CDC45/PBK/PRIM2/KIF4A/CENPF/POLA2
resGseaR_brca2.sa<-resGsea_brca2.sa@result[resGsea_brca2.sa@result$p.adjust<0.05,]


p=ggplot(resGseaR_brca2.sa, aes(x=ID, y=NES,fill=p.adjust)) + 
  geom_bar(stat = "identity") +
  scale_fill_continuous(low='red', high='blue')+
  coord_flip() +
  theme_bw()

p

# --- Negative NES Pathways ---
negative_pathways_brca2_sa<-resGseaR_brca2.sa[resGseaR_brca2.sa$NES<0,]$ID
negative_pathways_brca2_sa
##  [1] "HALLMARK_TNFA_SIGNALING_VIA_NFKB"   "HALLMARK_INFLAMMATORY_RESPONSE"    
##  [3] "HALLMARK_HYPOXIA"                   "HALLMARK_COMPLEMENT"               
##  [5] "HALLMARK_INTERFERON_GAMMA_RESPONSE" "HALLMARK_IL2_STAT5_SIGNALING"      
##  [7] "HALLMARK_ALLOGRAFT_REJECTION"       "HALLMARK_TGF_BETA_SIGNALING"       
##  [9] "HALLMARK_PROTEIN_SECRETION"         "HALLMARK_KRAS_SIGNALING_UP"        
## [11] "HALLMARK_P53_PATHWAY"
# --- Positive NES Pathways ---
positive_pathways_brca2_sa<-resGseaR_brca2.sa[resGseaR_brca2.sa$NES>0,]$ID
positive_pathways_brca2_sa
## [1] "HALLMARK_MYC_TARGETS_V1"            "HALLMARK_E2F_TARGETS"              
## [3] "HALLMARK_G2M_CHECKPOINT"            "HALLMARK_MYC_TARGETS_V2"           
## [5] "HALLMARK_OXIDATIVE_PHOSPHORYLATION" "HALLMARK_SPERMATOGENESIS"

12.1.4 BRCA2.AF vs NOMUT.AF

set.seed(123)
resGsea.brca2.af <- GSEA(geneList_brca2_af, TERM2GENE = H.symbol)
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
head(resGsea.brca2.af)
##                                                                    ID
## HALLMARK_TNFA_SIGNALING_VIA_NFKB     HALLMARK_TNFA_SIGNALING_VIA_NFKB
## HALLMARK_HYPOXIA                                     HALLMARK_HYPOXIA
## HALLMARK_INFLAMMATORY_RESPONSE         HALLMARK_INFLAMMATORY_RESPONSE
## HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE
## HALLMARK_INTERFERON_ALPHA_RESPONSE HALLMARK_INTERFERON_ALPHA_RESPONSE
## HALLMARK_GLYCOLYSIS                               HALLMARK_GLYCOLYSIS
##                                                           Description setSize
## HALLMARK_TNFA_SIGNALING_VIA_NFKB     HALLMARK_TNFA_SIGNALING_VIA_NFKB     179
## HALLMARK_HYPOXIA                                     HALLMARK_HYPOXIA     142
## HALLMARK_INFLAMMATORY_RESPONSE         HALLMARK_INFLAMMATORY_RESPONSE     163
## HALLMARK_INTERFERON_GAMMA_RESPONSE HALLMARK_INTERFERON_GAMMA_RESPONSE     168
## HALLMARK_INTERFERON_ALPHA_RESPONSE HALLMARK_INTERFERON_ALPHA_RESPONSE      80
## HALLMARK_GLYCOLYSIS                               HALLMARK_GLYCOLYSIS     133
##                                    enrichmentScore       NES       pvalue
## HALLMARK_TNFA_SIGNALING_VIA_NFKB        -0.5421456 -2.266193 1.000000e-10
## HALLMARK_HYPOXIA                        -0.5498362 -2.217124 1.771833e-10
## HALLMARK_INFLAMMATORY_RESPONSE          -0.4705958 -1.952521 1.709493e-07
## HALLMARK_INTERFERON_GAMMA_RESPONSE      -0.4323918 -1.797035 7.139468e-06
## HALLMARK_INTERFERON_ALPHA_RESPONSE      -0.4599663 -1.719380 9.574804e-04
## HALLMARK_GLYCOLYSIS                     -0.3781977 -1.519581 2.506683e-03
##                                        p.adjust       qvalue rank
## HALLMARK_TNFA_SIGNALING_VIA_NFKB   4.429584e-09 2.890886e-09 2334
## HALLMARK_HYPOXIA                   4.429584e-09 2.890886e-09 2656
## HALLMARK_INFLAMMATORY_RESPONSE     2.849155e-06 1.859448e-06 1808
## HALLMARK_INTERFERON_GAMMA_RESPONSE 8.924335e-05 5.824303e-05 2092
## HALLMARK_INTERFERON_ALPHA_RESPONSE 9.574804e-03 6.248819e-03 2835
## HALLMARK_GLYCOLYSIS                1.790488e-02 1.168529e-02 1815
##                                                      leading_edge
## HALLMARK_TNFA_SIGNALING_VIA_NFKB   tags=53%, list=21%, signal=42%
## HALLMARK_HYPOXIA                   tags=54%, list=24%, signal=41%
## HALLMARK_INFLAMMATORY_RESPONSE     tags=38%, list=16%, signal=32%
## HALLMARK_INTERFERON_GAMMA_RESPONSE tags=42%, list=19%, signal=35%
## HALLMARK_INTERFERON_ALPHA_RESPONSE tags=48%, list=26%, signal=35%
## HALLMARK_GLYCOLYSIS                tags=26%, list=17%, signal=22%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   core_enrichment
## HALLMARK_TNFA_SIGNALING_VIA_NFKB   TUBB2A/KLF10/JUN/KLF4/EDN1/MXD1/CXCL1/TLR2/SLC2A3/IL23A/ZFP36/ZBTB10/PLEK/DRAM1/TIPARP/AREG/EGR1/FOSB/ICAM1/CCRL2/BTG2/DUSP1/MYC/BCL6/RELB/PNRC1/IFNGR2/PLK2/NFKBIE/PANX1/JAG1/CCNL1/IL1B/NR4A1/TNIP1/TRAF1/TNFAIP2/MARCKS/NINJ1/SAT1/JUNB/BTG1/IER3/ABCA1/MAFF/CEBPB/ZC3H12A/CXCL2/CLCF1/BCL2A1/IFIT2/PLPP3/TNFAIP6/PMEPA1/TRIP10/BHLHE40/CD44/ATP2B1/CCL20/PER1/IRF1/HES1/KLF2/ACKR3/NFKB1/NFKB2/VEGFA/GCH1/CXCL3/SMAD3/EHD1/SNN/INHBA/IL1A/RELA/PLAUR/CEBPD/CCL2/REL/NR4A2/CD83/DENND5A/PPP1R15A/FOS/SLC2A6/KDM6B/F3/PTX3/B4GALT1/IRS2/LAMB3/PTGS2/G0S2/IL6
## HALLMARK_HYPOXIA                                                                                                                      TPI1/STBD1/ILVBL/TKTL1/MYH9/PFKFB3/P4HA1/SLC25A1/SLC2A5/SIAH2/SLC6A6/JUN/ATP7A/ERO1A/FOXO3/CDKN1C/SLC2A3/ZFP36/EFNA3/GAA/P4HA2/TIPARP/MXI1/TMEM45A/DUSP1/KDM3A/CCNG2/ENO1/GPI/SELENBP1/PNRC1/PYGM/ANKZF1/STC1/PAM/GBE1/PPFIA4/ADM/ZNF292/NDRG1/BTG1/IER3/AK4/KLHL24/MAFF/WSB1/AKAP12/BNIP3L/ALDOC/ENO2/HK2/PFKP/PLAC8/HS3ST1/AMPD3/NR3C1/CA12/BHLHE40/VLDLR/ISG20/GYS1/SDC2/HAS1/PDK1/ACKR3/VEGFA/MAP3K1/ANGPTL4/PLIN2/PLAUR/GRHPR/PPP1R15A/FOS/F3/IRS2/IL6
## HALLMARK_INFLAMMATORY_RESPONSE                                                                                                                                                                                                 ICAM1/CCRL2/BTG2/IL18RAP/CXCL8/SELL/MYC/NMI/IFITM1/EBI3/IFNGR2/RHOG/IL1B/IL1R1/FZD5/ADM/IRAK2/ABCA1/SLC1A2/BEST1/IL4R/C5AR1/LAMP3/SCN1B/TNFAIP6/EREG/MARCO/MET/TNFSF15/CD40/ITGB8/NOD2/AQP9/ATP2B1/CCL20/ITGA5/TNFRSF1B/ICAM4/IRF1/NFKB1/TIMP1/GCH1/C3AR1/LYN/ADGRE1/CALCRL/NLRP3/FFAR2/LPAR1/CD48/INHBA/IL1A/CCL7/RELA/PLAUR/IRF7/CCL2/CLEC5A/F3/RNF144B/CSF3/IL6
## HALLMARK_INTERFERON_GAMMA_RESPONSE                                                                                                                            HERC6/IL15RA/IFIT3/NAMPT/TRIM25/ZNFX1/IL18BP/LY6E/IFIT1/PSMB8/FPR1/TXNIP/IFNAR2/FGL2/SLAMF7/TAPBP/IFITM2/PARP14/UPP1/CASP4/HLA-DQA1/ICAM1/OAS2/IFI44L/NMI/IFITM3/PML/IRF2/HELZ2/LGALS3BP/IRF9/MX1/STAT4/HLA-DRB1/TNFAIP2/BTG1/PARP12/GBP6/CD74/SP110/UBE2L6/GBP4/PFKP/APOL6/AUTS2/IL4R/CD38/IFIT2/PELI1/TNFAIP6/RBCK1/OAS3/CD274/EPSTI1/CD40/ISG20/IRF1/MX2/ZBP1/NFKB1/XAF1/GCH1/PTPN6/IRF8/CCL7/PLA2G4A/IRF7/CCL2/P2RY14/PTGS2/IL6
## HALLMARK_INTERFERON_ALPHA_RESPONSE                                                                                                                                                                                                                                                                                                                             IL7/HERC6/IFIT3/TRIM25/LY6E/PSMB8/TXNIP/SAMD9/IFITM2/PARP14/MVB12A/MOV10/CCRL2/SELL/IFI44L/NMI/IFITM3/IFITM1/IRF2/HELZ2/LGALS3BP/UBA7/IRF9/MX1/PARP12/TMEM140/CD74/SP110/UBE2L6/GBP4/IL4R/LAMP3/IFIT2/EPSTI1/ISG20/PARP9/IRF1/IRF7
## HALLMARK_GLYCOLYSIS                                                                                                                                                                                                                                                                                                                                                                            MXI1/ENO1/PMM2/ANG/ANKZF1/STC1/PAM/SLC25A10/GLCE/PPFIA4/QSOX1/ZNF292/IER3/AK4/ENO2/HK2/PFKP/CHST12/LHPP/RBCK1/PKM/MET/VLDLR/ISG20/CD44/GYS1/PLOD1/VCAN/SDC2/SPAG4/VEGFA/ANGPTL4/PLOD2/B4GALT1/IRS2
#plots
resGseaR.brca2.af<-resGsea.brca2.af@result[resGsea.brca2.af@result$p.adjust<0.05,]

p=ggplot(resGseaR.brca2.af, aes(x=ID, y=NES,fill=p.adjust)) + 
  geom_bar(stat = "identity") +
  scale_fill_continuous(low='red', high='blue')+
  coord_flip() +
  theme_bw()

p

# --- Negative NES Pathways ---
negative_pathways_brca2_af<-resGseaR.brca2.af[resGseaR.brca2.af$NES<0,]$ID
negative_pathways_brca2_af
##  [1] "HALLMARK_TNFA_SIGNALING_VIA_NFKB"          
##  [2] "HALLMARK_HYPOXIA"                          
##  [3] "HALLMARK_INFLAMMATORY_RESPONSE"            
##  [4] "HALLMARK_INTERFERON_GAMMA_RESPONSE"        
##  [5] "HALLMARK_INTERFERON_ALPHA_RESPONSE"        
##  [6] "HALLMARK_GLYCOLYSIS"                       
##  [7] "HALLMARK_COMPLEMENT"                       
##  [8] "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION"
##  [9] "HALLMARK_MITOTIC_SPINDLE"                  
## [10] "HALLMARK_UV_RESPONSE_DN"                   
## [11] "HALLMARK_IL6_JAK_STAT3_SIGNALING"          
## [12] "HALLMARK_KRAS_SIGNALING_UP"
# --- Positive NES Pathways ---
positive_pathways_brca2_af<-resGseaR.brca2.af[resGseaR.brca2.af$NES>0,]$ID

12.1.5 Comparisons

12.1.5.1 BRCA1.SA vs BRCA1.AF

It is interesting to study the comparison between BRCA1 of healthy and affected samples.

Let’s retrieve a list of positive pathways shared between BRCA1 healthy and BRCA1 affected:

library(VennDiagram)
intersect(positive_pathways_brca1_sa,positive_pathways_brca1_af)
## [1] "HALLMARK_MYC_TARGETS_V1" "HALLMARK_E2F_TARGETS"   
## [3] "HALLMARK_MYC_TARGETS_V2"
venn.diagram(
  x = list(positive_pathways_brca1_sa, positive_pathways_brca1_af),
  category.names = c("BRCA1.HEALTHY", "BRCA1.AFFECTED"),
  filename = 'hall_positive_venn_diagramm_brca1_vs_brca1_50filt.png',
  output = TRUE,
  main = 'Positive BRCA1 Healthy vs Positive BRCA1 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),cat.pos = c(-27, 160),cat.dist= c(0.02,-0.05))
## [1] 1

Negative pathways shaded between BRCA1 healthy and BRCA1 affected:

intersect(negative_pathways_brca1_sa,negative_pathways_brca1_af)
## [1] "HALLMARK_INFLAMMATORY_RESPONSE"            
## [2] "HALLMARK_HYPOXIA"                          
## [3] "HALLMARK_ANGIOGENESIS"                     
## [4] "HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION"
## [5] "HALLMARK_KRAS_SIGNALING_UP"                
## [6] "HALLMARK_COAGULATION"
venn.diagram(
  x = list(negative_pathways_brca1_sa, negative_pathways_brca1_af),
  category.names = c("BRCA1.HEALTHY", "BRCA1.AFFECTED"),
  filename = 'hall_negative_venn_diagramm_brca1_vs_brca1_50filt.png',
  output = TRUE,
  main = 'Negative BRCA1 Healthy vs Negative BRCA1 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),cat.pos = c(-27, 160),cat.dist= c(0.02,-0.05))
## [1] 1

Positive BRCA1 healthy pathways that are negative in BRCA1 affected

intersect(positive_pathways_brca1_sa,negative_pathways_brca1_af)
## character(0)
venn.diagram(
  x = list(positive_pathways_brca1_sa, negative_pathways_brca1_af),
  category.names = c("BRCA1.HEALTHY", "BRCA1.AFECTED"),
  filename = 'hall_pos_neg_venn_diagramm_brca1sa_vs_brca1af_50filt.png',
  output = TRUE,
  main = 'Venn diagram BRCA1 Healthy vs BRCA1 Afected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"))
## [1] 1

There are no pathways shared between positive BRCA1 healthy and negative BRCA1 affected.

Negative BRCA1 healthy pathways that are positive in BRCA1 affected

intersect(negative_pathways_brca1_sa,positive_pathways_brca2_af)
## character(0)
venn.diagram(
  x = list(negative_pathways_brca1_sa, positive_pathways_brca2_af),
  category.names = c("BRCA1.HEALTHY", "BRCA1.AFFECTED"),
  filename = 'msigdb_neg_pos_venn_diagramm_brca1sa_vs_brca1af_50filt.png',
  output = TRUE,
  main = 'Venn diagram BRCA1 Healthy vs BRCA1 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"))
## [1] 1

There are no pathways shared between negative BRCA1 healthy and positive BRCA1 affected.

12.1.5.2 BRCA2.SA vs BRCA2.AF

And finally, for the hallmark gene set collection, we will study the comparisons between BRCA2 healthy and BRCA2 affected samples.

Positive pathways shared between BRCA2 healthy and BRCA2 affected:

intersect(positive_pathways_brca2_sa,positive_pathways_brca2_af)
## character(0)
venn.diagram(
  x = list(positive_pathways_brca2_sa, positive_pathways_brca2_af),
  category.names = c("BRCA2.HEALTHY", "BRCA2.AFFECTED"),
  filename = 'hallmark_positive_venn_diagramm_brca2_vs_brca2_50filt.png',
  output = TRUE,
  main = 'Venn diagram BRCA2 Healthy vs BRCA2 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"))
## [1] 1

No positive pathways shared between BRCA2 healthy and affected.

Negative pathways shaded between brca2 healthy and brca2 afected:

intersect(negative_pathways_brca2_sa,negative_pathways_brca2_af)
## [1] "HALLMARK_TNFA_SIGNALING_VIA_NFKB"   "HALLMARK_INFLAMMATORY_RESPONSE"    
## [3] "HALLMARK_HYPOXIA"                   "HALLMARK_COMPLEMENT"               
## [5] "HALLMARK_INTERFERON_GAMMA_RESPONSE" "HALLMARK_KRAS_SIGNALING_UP"
venn.diagram(
  x = list(negative_pathways_brca2_sa, negative_pathways_brca2_af),
  category.names = c("BRCA2.HEALTHY", "BRCA2.AFFECTED"),
  filename = 'hallmark_negative_venn_diagramm_brca2_vs_brca2_50filt.png',
  output = TRUE,
  main = 'Venn diagram negative BRCA2 Healthy vs BRCA2 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),cat.pos = c(-27, 160),cat.dist= c(0.02,-0.05))
## [1] 1

Positive BRCA2 healthy pathways that are negative in BRCA2 affected:

intersect(positive_pathways_brca2_sa,negative_pathways_brca2_af)
## character(0)
venn.diagram(
  x = list(positive_pathways_brca2_sa, negative_pathways_brca2_af),
  category.names = c("BRCA2.HEALTHY", "BRCA2.AFFECTED"),
  filename = 'hallmark_pos_neg_venn_diagramm_brca2_vs_brca2_50filt.png',
  output = TRUE,
  main = 'Positive BRCA2 Healthy vs Negative BRCA2 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),cat.pos = c(-27, 160),cat.dist= c(0.02,-0.05))
## [1] 1

No pathways shared between positive BRCA2 healthy and negative in BRCA2 affected.

Negative BRCA2 healthy pathways that are positive in BRCA2 affected:

intersect(negative_pathways_brca2_sa,positive_pathways_brca2_af)
## character(0)
venn.diagram(
  x = list(negative_pathways_brca2_sa, positive_pathways_brca2_af),
  category.names = c("BRCA2.HEALTHY", "BRCA2.AFFECTED"),
  filename = 'hallmark_neg_pos_venn_diagramm_brca2sa_vs_brca2af_50filt.png',
  output = TRUE,
  main = 'Venn diagram BRCA2 Healthy vs BRCA2 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"))
## [1] 1

No negative pathways of BRCA2 healthy shared with the positive pathways of BRA2 affected.

12.2 GSEA - mSigDB - Reactome

This helper function shows the available collections.

msigdbr_collections()
## # A tibble: 23 × 3
##    gs_cat gs_subcat         num_genesets
##    <chr>  <chr>                    <int>
##  1 C1     ""                         299
##  2 C2     "CGP"                     3384
##  3 C2     "CP"                        29
##  4 C2     "CP:BIOCARTA"              292
##  5 C2     "CP:KEGG"                  186
##  6 C2     "CP:PID"                   196
##  7 C2     "CP:REACTOME"             1615
##  8 C2     "CP:WIKIPATHWAYS"          664
##  9 C3     "MIR:MIRDB"               2377
## 10 C3     "MIR:MIR_Legacy"           221
## # ℹ 13 more rows

We will select the Reactome subset of canonical patwhays within the C2 collection. There are 1654 gene sets, so we expect to see a higher number of gene sets in our analysis compared with the hallamrk.

library(msigdbr)

reactome_gene_sets = msigdbr(species = "Homo sapiens", category = "C2", subcategory = 'CP:REACTOME')
R.entrez <- reactome_gene_sets %>% dplyr::select(gs_name, entrez_gene)
R.symbol <- reactome_gene_sets %>% dplyr::select(gs_name, gene_symbol)

12.2.1 BRCA1.SA VS NOMUT.SA

set.seed(123)
react.brca1.sa<- GSEA(geneList_brca1.sa, TERM2GENE = R.symbol)
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
react.brca1.sa<-react.brca1.sa@result[react.brca1.sa@result$p.adjust<0.05,]
head(react.brca1.sa)
##                                                                    ID
## REACTOME_TRANSLATION                             REACTOME_TRANSLATION
## REACTOME_SYNTHESIS_OF_DNA                   REACTOME_SYNTHESIS_OF_DNA
## REACTOME_G2_M_CHECKPOINTS                   REACTOME_G2_M_CHECKPOINTS
## REACTOME_CELL_CYCLE_CHECKPOINTS       REACTOME_CELL_CYCLE_CHECKPOINTS
## REACTOME_MITOCHONDRIAL_TRANSLATION REACTOME_MITOCHONDRIAL_TRANSLATION
## REACTOME_S_PHASE                                     REACTOME_S_PHASE
##                                                           Description setSize
## REACTOME_TRANSLATION                             REACTOME_TRANSLATION     139
## REACTOME_SYNTHESIS_OF_DNA                   REACTOME_SYNTHESIS_OF_DNA      82
## REACTOME_G2_M_CHECKPOINTS                   REACTOME_G2_M_CHECKPOINTS      86
## REACTOME_CELL_CYCLE_CHECKPOINTS       REACTOME_CELL_CYCLE_CHECKPOINTS     165
## REACTOME_MITOCHONDRIAL_TRANSLATION REACTOME_MITOCHONDRIAL_TRANSLATION      73
## REACTOME_S_PHASE                                     REACTOME_S_PHASE     107
##                                    enrichmentScore      NES pvalue p.adjust
## REACTOME_TRANSLATION                     0.6779840 2.840981  1e-10  4.4e-09
## REACTOME_SYNTHESIS_OF_DNA                0.6949890 2.709474  1e-10  4.4e-09
## REACTOME_G2_M_CHECKPOINTS                0.6867043 2.693896  1e-10  4.4e-09
## REACTOME_CELL_CYCLE_CHECKPOINTS          0.6204021 2.687386  1e-10  4.4e-09
## REACTOME_MITOCHONDRIAL_TRANSLATION       0.7031058 2.662734  1e-10  4.4e-09
## REACTOME_S_PHASE                         0.6548499 2.651480  1e-10  4.4e-09
##                                          qvalue rank
## REACTOME_TRANSLATION               2.807018e-09 2282
## REACTOME_SYNTHESIS_OF_DNA          2.807018e-09 2867
## REACTOME_G2_M_CHECKPOINTS          2.807018e-09 2345
## REACTOME_CELL_CYCLE_CHECKPOINTS    2.807018e-09 2362
## REACTOME_MITOCHONDRIAL_TRANSLATION 2.807018e-09 2348
## REACTOME_S_PHASE                   2.807018e-09 2867
##                                                      leading_edge
## REACTOME_TRANSLATION               tags=65%, list=21%, signal=53%
## REACTOME_SYNTHESIS_OF_DNA          tags=80%, list=26%, signal=60%
## REACTOME_G2_M_CHECKPOINTS          tags=69%, list=21%, signal=54%
## REACTOME_CELL_CYCLE_CHECKPOINTS    tags=62%, list=22%, signal=50%
## REACTOME_MITOCHONDRIAL_TRANSLATION tags=71%, list=21%, signal=56%
## REACTOME_S_PHASE                   tags=75%, list=26%, signal=56%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  core_enrichment
## REACTOME_TRANSLATION                        MRPL42/MARS2/MRPL18/MRPL32/MRPL14/GFM1/EIF2B3/SRPRB/MRPL22/EIF2S1/EIF2S2/MRPS23/WARS2/DARS2/MRPL3/EIF3C/MRPS9/MRPS17/EEF1E1/MRPL35/MRPL47/MRPS35/MRPS28/MRPL17/EIF2B1/MRPS15/MRPL1/RPS23/MRPL24/EIF2B2/MRPL21/AIMP2/EIF3I/MRPL15/ETF1/LARS2/AIMP1/RPL26L1/MRPS12/MRPL45/GFM2/EIF3J/MRPL48/MRPS30/MRPS33/SEC11C/MRPS22/EIF3B/MRPL27/VARS2/MRPS7/MRPS34/MRPL9/MRPL16/RPL14/RPS21/MRPS18C/RPS11/MRPL46/MRPL37/MRPL19/MRPS21/MRPL51/TSFM/SRP19/RPL18A/EIF5B/RPL27A/TUFM/MRPL11/FARSA/PPA1/YARS2/EEF1D/RPL23/MRPL12/MRPL57/EIF4EBP1/MRPS24/SEC61A1/MRPS10/MRPL44/MRPS16/RPL38/FARSB/MTIF2/MRPL36/RPL27/MRPS26/EIF3H/MTFMT
## REACTOME_SYNTHESIS_OF_DNA                                                                                                                                                                                                                                            PSMA5/PSMD14/RFC4/PSMC1/PSMC3/PSMD11/PSMA1/PSMA8/POLD3/ORC5/CDC27/GINS3/ORC2/PSMA2/POLA1/ORC3/PSME3/RFC3/SKP2/PSMF1/LIG1/POLE3/POLE2/PSMC2/ANAPC1/FEN1/POLA2/ORC4/MCM6/ANAPC15/ORC1/CDC6/MCM3/PSMB6/MCM7/PSMC5/PSMD1/PRIM2/PSMD12/PSMD13/UBE2C/PSMC6/PSMD2/RFC5/CCNE1/PCNA/PSMB5/CDC45/MCM4/PSMB7/PSMA7/PSMC4/CCNA2/MCM2/PRIM1/GINS1/DNA2/PSMB3/ORC6/GINS4/RFC2/CDT1/ANAPC7/RPA3/MCM5/GINS2
## REACTOME_G2_M_CHECKPOINTS                                                                                                                                                                                                                                                                                         PSMA5/PSMD14/RFC4/PSMC1/PSMC3/PSMD11/PSMA1/PSMA8/ORC5/TOPBP1/ORC2/PSMA2/ORC3/RBBP8/BLM/PSME3/RFC3/RAD1/EXO1/PSMF1/CCNB1/CHEK1/RMI2/DBF4/BRCC3/PSMC2/ORC4/CLSPN/MCM6/ORC1/CDC6/MCM3/PSMB6/MCM7/PSMC5/PSMD1/PSMD12/PSMD13/CDC7/PSMC6/PSMD2/RFC5/BARD1/PSMB5/CDK1/WEE1/CDC45/MCM10/CDC25A/MCM4/PSMB7/PSMA7/PSMC4/MCM2/UBE2V2/DNA2/RMI1/PSMB3/ORC6
## REACTOME_CELL_CYCLE_CHECKPOINTS    KIF18A/PSMA5/CENPE/PSMD14/RFC4/CENPO/PSMC1/PSMC3/PSMD11/PSMA1/PSMA8/NUP107/ZWILCH/NUP85/SPDL1/RCC2/ORC5/CDC27/PPP1CC/ERCC6L/TOPBP1/CENPN/SKA1/NDC80/ITGB3BP/ORC2/PSMA2/CENPI/CDC20/ORC3/SKA2/RBBP8/BLM/PSME3/PMF1/RFC3/BUB1/RAD1/EXO1/PSMF1/CENPF/KNTC1/CCNB1/NUDC/CHEK1/RMI2/KIF2C/CENPU/DBF4/BRCC3/PSMC2/ANAPC1/CENPP/DYNLL2/AHCTF1/ZWINT/ORC4/CLSPN/BIRC5/MCM6/ANAPC15/ORC1/CDC6/MCM3/PSMB6/MCM7/PSMC5/PSMD1/PSMD12/BUB1B/PSMD13/UBE2C/CDC7/NUF2/MAD2L1/PSMC6/PSMD2/SPC25/RFC5/CCNE1/NUP160/BARD1/PSMB5/CDK1/WEE1/CDC45/MCM10/CDC25A/MCM4/PSMB7/PSMA7/NUP37/PSMC4/NUP43/CCNA2/MCM2/UBE2V2/DNA2/RMI1/KIF2A/PSMB3/ORC6/AURKB
## REACTOME_MITOCHONDRIAL_TRANSLATION                                                                                                                                                                                                                                                                 MRPL42/MRPL18/MRPL32/MRPL14/GFM1/MRPL22/MRPS23/MRPL3/MRPS9/MRPS17/MRPL35/MRPL47/MRPS35/MRPS28/MRPL17/MRPS15/MRPL1/MRPL24/MRPL21/MRPL15/MRPS12/MRPL45/GFM2/MRPL48/MRPS30/MRPS33/MRPS22/MRPL27/MRPS7/MRPS34/MRPL9/MRPL16/MRPS18C/MRPL46/MRPL37/MRPL19/MRPS21/MRPL51/TSFM/TUFM/MRPL11/MRPL12/MRPL57/MRPS24/MRPS10/MRPL44/MRPS16/MTIF2/MRPL36/MRPS26/MTFMT/MRPL50
## REACTOME_S_PHASE                                                                                                                                                                       PSMA5/CDK7/PSMD14/RFC4/PSMC1/PSMC3/PSMD11/PSMA1/PSMA8/POLD3/ORC5/CDC27/GINS3/ORC2/CDK4/PSMA2/POLA1/ORC3/MNAT1/LIN54/PSME3/RFC3/SKP2/PSMF1/LIG1/POLE3/POLE2/CDCA5/PSMC2/ANAPC1/FEN1/POLA2/ORC4/MCM6/ANAPC15/ORC1/CDC6/MCM3/PSMB6/MCM7/ESCO1/PSMC5/PSMD1/PRIM2/PSMD12/PSMD13/CKS1B/UBE2C/PSMC6/PSMD2/RFC5/CCNE1/PCNA/PSMB5/MYC/WEE1/CDC45/CDC25A/MCM4/PSMB7/PSMA7/PSMC4/CCNA2/MCM2/LIN52/PRIM1/GINS1/DNA2/PSMB3/ORC6/E2F1/GINS4/TFDP2/RFC2/CDT1/ANAPC7/RPA3/SMC3/MCM5/GINS2
#positive NES
positive_NES_reactome_brca1.sa<-react.brca1.sa[react.brca1.sa$NES>0,]$ID
length(positive_NES_reactome_brca1.sa)
## [1] 185
#negative NES
negative_NES_reactome_brca1.sa<-react.brca1.sa[react.brca1.sa$NES<0,]$ID
length(negative_NES_reactome_brca1.sa)
## [1] 91

12.2.2 BRCA1.AF vs NOMUT.AF

set.seed(123)
react.brca1.af <- GSEA(geneList_brca1_af, TERM2GENE = R.symbol)
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
react.brca1.af<-react.brca1.af@result[react.brca1.af@result$p.adjust<0.05,]
head(react.brca1.af)
##                                                                                    ID
## REACTOME_NEUTROPHIL_DEGRANULATION                   REACTOME_NEUTROPHIL_DEGRANULATION
## REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION
## REACTOME_METABOLISM_OF_RNA                                 REACTOME_METABOLISM_OF_RNA
##                                                                           Description
## REACTOME_NEUTROPHIL_DEGRANULATION                   REACTOME_NEUTROPHIL_DEGRANULATION
## REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION
## REACTOME_METABOLISM_OF_RNA                                 REACTOME_METABOLISM_OF_RNA
##                                            setSize enrichmentScore       NES
## REACTOME_NEUTROPHIL_DEGRANULATION              329      -0.3712417 -1.730702
## REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION     145      -0.4142891 -1.745181
## REACTOME_METABOLISM_OF_RNA                     370       0.3529883  1.551548
##                                                  pvalue     p.adjust
## REACTOME_NEUTROPHIL_DEGRANULATION          5.604194e-07 0.0005178275
## REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION 7.144414e-05 0.0282190022
## REACTOME_METABOLISM_OF_RNA                 9.162014e-05 0.0282190022
##                                                  qvalue rank
## REACTOME_NEUTROPHIL_DEGRANULATION          0.0004955287 2387
## REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION 0.0270038299 2337
## REACTOME_METABOLISM_OF_RNA                 0.0270038299 3344
##                                                              leading_edge
## REACTOME_NEUTROPHIL_DEGRANULATION          tags=39%, list=22%, signal=31%
## REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION tags=42%, list=21%, signal=34%
## REACTOME_METABOLISM_OF_RNA                 tags=38%, list=31%, signal=27%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    core_enrichment
## REACTOME_NEUTROPHIL_DEGRANULATION                                                                                            PTX3/AGL/SERPINA1/ALDOC/IST1/NCKAP1L/MMP9/TMEM63A/CD59/NPC2/CKAP4/NFAM1/DERA/CRISPLD2/PTPRC/P2RX1/B4GALT1/DDX3X/GDI2/VAT1/GLB1/ASAH1/TNFRSF1B/IGF2R/ITGB2/HEBP2/TIMP2/CYSTM1/PGLYRP1/MAPK14/CYFIP1/PLEKHO2/CLEC4D/S100P/SLC11A1/TYROBP/CTSH/AMPD3/CD55/LGALS3/CSTB/CXCL1/GPI/DEFA4/RNASE2/GLA/PRSS2/CD53/RAB3D/PSMD13/ALDH3B1/PECAM1/CD63/FPR2/A1BG/CFP/PLAUR/DEGS1/FCER1G/CD300A/TMEM30A/SERPINA3/HPSE/SIGLEC9/YPEL5/CD36/CEACAM6/HSPA1A/ADAM10/CD44/SNAP23/FCGR2A/RAB9B/ACTR2/CHI3L1/GAA/FPR1/HBB/AZU1/RETN/ATG7/CAB39/ANPEP/OLR1/DEFA1/PKP1/MCEMP1/ANO6/CEACAM1/LAMP1/LRG1/CD14/GNS/MS4A3/HSPA6/CTSB/CEACAM8/SLC2A3/CD93/S100A12/CLEC5A/S100A8/PRG2/RNASE3/LILRB2/IQGAP1/SIGLEC5/CPPED1/C3/ITGAV/MGAM/CPNE3/SYNGR1/S100A9/PLD1/GMFG/ATP8A1/ANXA2/QPCT/STBD1/PTPRJ/LYZ/FTH1/GLIPR1/FCAR/LILRA3/TCN1
## REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  EMILIN1/MMP9/ICAM2/ITGA1/ITGA6/EMILIN3/ADAMTS14/PRKCA/NTN4/CAPN7/ITGB2/TIMP2/JAM3/ITGB8/COL16A1/DSPP/MFAP3/PRSS2/ITGA4/CTRB2/APP/COL1A1/EMILIN2/PECAM1/ADAMTS16/ITGB1/P4HA2/F11R/LOXL1/CEACAM6/ADAM10/CD44/P4HB/P4HA1/COL6A3/CDH1/ITGAE/MMP2/TIMP1/VTN/COL23A1/HTRA1/GDF5/CEACAM1/CTSB/CEACAM8/ADAM17/ITGB3/THBS1/FGF2/MMP3/ITGAV/PXDN/VCAN/LAMB3/PLOD2/TPSAB1/SDC2/FBLN5/ADAM12/ADAM9
## REACTOME_METABOLISM_OF_RNA                 RRP7A/KHSRP/RPL3L/TRMT11/CPSF1/SMG5/GTF2F2/APOBEC1/SMG7/PSMD3/SUPT5H/SENP3/HNRNPL/MPHOSPH10/APOBEC3A/SF3A2/NOP2/SNRNP70/UPF2/GTF2H4/HNRNPH1/LAS1L/NUP62/CPSF7/TRMT6/DDX21/CNOT3/PUS3/PNO1/AKT1/RBM28/QTRT1/TNFSF13/TNKS1BP1/CWC22/WDR43/NSUN2/RNMT/PSMD1/XAB2/PABPN1/SF3B4/DDX1/PCBP1/SNUPN/DHX37/RIOK2/NT5C3B/XPOT/TRMT1/TSR1/NUP210/PELP1/MPHOSPH6/WDR77/RTCB/NOB1/ALKBH8/HEATR1/IMP4/BOP1/NUP85/BMS1/UTP15/METTL1/ZFP36L1/PAIP1/FUS/RIOK1/TYW3/GNL3/HNRNPUL1/CDK7/FTSJ1/BCAS2/DDX20/PSMD12/DCPS/PSME3/THUMPD1/PRCC/NOC4L/RPL39L/PSMB6/RPP25/WBP4/YWHAB/BYSL/SNRNP25/SF3B3/SNRPA1/TRNT1/PPWD1/DUS2/POM121C/NOL6/DDX49/PRPF8/NIP7/EDC3/RAN/NCL/EXOSC9/DKC1/DDX39A/TXNL4A/APOBEC3C/LTV1/PDCD11/DHX38/GEMIN2/MRM1/UTP4/NCBP2/ADAT2/PSMA7/PSMD14/PUS1/LSM8/SRSF10/CDKAL1/WDR75/DCP1B/PSMA5/GAR1/PCBP2/PUF60/GTF2H3/WDR36/PHAX/SNRPD3/NUP205/FIP1L1/PUS7/MAPKAPK2/CSTF2/TRMT13/RRP9/GEMIN5
#positive
positive_NES_reactome_brca1.af<-react.brca1.af[react.brca1.af$NES>0,]$ID
length(positive_NES_reactome_brca1.af)
## [1] 1
#negative NES
negative_NES_reactome_brca1.af<-react.brca1.af[react.brca1.af$NES<0,]$ID
length(negative_NES_reactome_brca1.af)
## [1] 2

12.2.3 BRCA2.SA VS NOMUT.SA

set.seed(123)
react.brca2.sa <- GSEA(geneList_brca2_sa, TERM2GENE = R.symbol)
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
react.brca2.sa<-react.brca2.sa@result[react.brca2.sa@result$p.adjust<0.05,]
head(react.brca2.sa)
##                                                                    ID
## REACTOME_TRANSLATION                             REACTOME_TRANSLATION
## REACTOME_MITOCHONDRIAL_TRANSLATION REACTOME_MITOCHONDRIAL_TRANSLATION
## REACTOME_SYNTHESIS_OF_DNA                   REACTOME_SYNTHESIS_OF_DNA
## REACTOME_S_PHASE                                     REACTOME_S_PHASE
## REACTOME_ADAPTIVE_IMMUNE_SYSTEM       REACTOME_ADAPTIVE_IMMUNE_SYSTEM
## REACTOME_CELL_CYCLE_CHECKPOINTS       REACTOME_CELL_CYCLE_CHECKPOINTS
##                                                           Description setSize
## REACTOME_TRANSLATION                             REACTOME_TRANSLATION     139
## REACTOME_MITOCHONDRIAL_TRANSLATION REACTOME_MITOCHONDRIAL_TRANSLATION      73
## REACTOME_SYNTHESIS_OF_DNA                   REACTOME_SYNTHESIS_OF_DNA      82
## REACTOME_S_PHASE                                     REACTOME_S_PHASE     107
## REACTOME_ADAPTIVE_IMMUNE_SYSTEM       REACTOME_ADAPTIVE_IMMUNE_SYSTEM     449
## REACTOME_CELL_CYCLE_CHECKPOINTS       REACTOME_CELL_CYCLE_CHECKPOINTS     165
##                                    enrichmentScore       NES       pvalue
## REACTOME_TRANSLATION                     0.5263538  2.155525 7.881199e-10
## REACTOME_MITOCHONDRIAL_TRANSLATION       0.6230893  2.292408 7.166966e-09
## REACTOME_SYNTHESIS_OF_DNA                0.5911639  2.218839 7.578185e-09
## REACTOME_S_PHASE                         0.5342948  2.096631 5.475571e-08
## REACTOME_ADAPTIVE_IMMUNE_SYSTEM         -0.3442244 -1.663044 8.558456e-08
## REACTOME_CELL_CYCLE_CHECKPOINTS          0.4594296  1.921238 4.040332e-07
##                                        p.adjust       qvalue rank
## REACTOME_TRANSLATION               7.282228e-07 6.147335e-07 2865
## REACTOME_MITOCHONDRIAL_TRANSLATION 2.334081e-06 1.970328e-06 2865
## REACTOME_SYNTHESIS_OF_DNA          2.334081e-06 1.970328e-06 2870
## REACTOME_S_PHASE                   1.264857e-05 1.067736e-05 2770
## REACTOME_ADAPTIVE_IMMUNE_SYSTEM    1.581603e-05 1.335119e-05 2275
## REACTOME_CELL_CYCLE_CHECKPOINTS    6.222111e-05 5.252431e-05 2770
##                                                      leading_edge
## REACTOME_TRANSLATION               tags=55%, list=26%, signal=41%
## REACTOME_MITOCHONDRIAL_TRANSLATION tags=68%, list=26%, signal=51%
## REACTOME_SYNTHESIS_OF_DNA          tags=61%, list=26%, signal=45%
## REACTOME_S_PHASE                   tags=53%, list=25%, signal=40%
## REACTOME_ADAPTIVE_IMMUNE_SYSTEM    tags=29%, list=21%, signal=24%
## REACTOME_CELL_CYCLE_CHECKPOINTS    tags=52%, list=25%, signal=40%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             core_enrichment
## REACTOME_TRANSLATION                                                                                                                                                                                                                                                                                                                                                     MRPL18/MRPL16/MRPL51/MTFMT/RPS23/MRPL48/MRPL1/MRPS7/MRPL32/MRPS33/FARS2/MRPS35/VARS2/EEF1E1/MRPL3/MARS2/MRPL14/MRPS12/MRPL21/GFM1/MRPL15/MRPS22/DARS2/MRPS34/EIF2B3/MRPL37/LARS2/MRPL45/MRRF/MRPS23/TSFM/MRPL9/AIMP2/GSPT2/SRPRB/MRPS15/MRPL24/MRPL17/EIF2B2/MRPL50/FARSA/MRPS24/MRPL35/MRPL22/EIF3J/EIF2S1/MRPS26/MRPL34/EIF3I/RPL26L1/MRPL11/EIF2B1/EIF1AX/MRPS16/MRPL36/YARS2/GFM2/MRPS28/EIF3H/RPL3L/MRPS18A/MRPL46/MRPS18B/MRPS30/MTIF2/PPA1/MRPS17/MRPL47/MRPL44/MRPL23/MRPL43/RPL22L1/EIF3L/MRPL4/ETF1/TUFM
## REACTOME_MITOCHONDRIAL_TRANSLATION                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           MRPL18/MRPL16/MRPL51/MTFMT/MRPL48/MRPL1/MRPS7/MRPL32/MRPS33/MRPS35/MRPL3/MRPL14/MRPS12/MRPL21/GFM1/MRPL15/MRPS22/MRPS34/MRPL37/MRPL45/MRRF/MRPS23/TSFM/MRPL9/MRPS15/MRPL24/MRPL17/MRPL50/MRPS24/MRPL35/MRPL22/MRPS26/MRPL34/MRPL11/MRPS16/MRPL36/GFM2/MRPS28/MRPS18A/MRPL46/MRPS18B/MRPS30/MTIF2/MRPS17/MRPL47/MRPL44/MRPL23/MRPL43/MRPL4/TUFM
## REACTOME_SYNTHESIS_OF_DNA                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    RFC3/UBE2C/PSMC3/SKP2/GINS2/MCM4/CCNE1/FEN1/PSMB5/ORC1/PSMB3/PSMA1/MCM2/RFC4/POLE2/GINS3/POLD3/CCNE2/MCM7/PSMA7/RFC5/PCNA/CDT1/PSME3/PSMA8/CDC6/CCNA2/PSMB6/RPA3/MCM3/PSMD14/PSMD11/ORC3/DNA2/ANAPC15/CDC45/PRIM2/POLA1/POLA2/POLE3/PRIM1/PSMD1/PSMC6/PSMC2/PSMD2/ORC4/PSMA2/UBE2S/PSMC5/PSMC4
## REACTOME_S_PHASE                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       RFC3/UBE2C/PSMC3/SKP2/CDCA5/GINS2/MCM4/CDK4/CCNE1/FEN1/PSMB5/ORC1/PSMB3/PSMA1/MCM2/CDC25A/RFC4/POLE2/GINS3/POLD3/CCNE2/MCM7/PSMA7/RFC5/PCNA/CKS1B/CDT1/PSME3/PSMA8/MYC/CDC6/TFDP1/CCNA2/WEE1/PSMB6/RPA3/E2F1/MCM3/PSMD14/PSMD11/ORC3/DNA2/ANAPC15/CDC45/PRIM2/POLA1/POLA2/POLE3/PRIM1/PSMD1/PSMC6/PSMC2/PSMD2/ORC4/PSMA2/UBE2S/PSMC5
## REACTOME_ADAPTIVE_IMMUNE_SYSTEM    KLHL22/ITGB7/HACE1/BLNK/FBXL5/PRKACB/KCTD7/SH2D1A/IKBKB/AKT1/HERC1/TRIM41/CD81/CDH1/BTN3A1/VAMP3/ZNRF1/KIF3A/CTSC/NCR1/PILRB/PTPRC/SEC22B/LAIR1/PTPN22/UBE2V2/YES1/PPP3CA/CD8A/PJA2/HLA-DMA/WAS/CBLB/BTN1A1/HLA-E/SOS1/AP2A1/LNPEP/IGHD/TUBB4A/STIM1/CD200R1/LILRA6/CD80/LTN1/CRTAM/CTSH/UBA1/SOCS3/BLK/CD74/KLRB1/PPP2R5C/PIK3AP1/RAP1GAP2/CTSS/ERAP2/BTN3A2/PIK3CB/PAG1/TLR4/ITPR1/LILRB2/LAG3/TRIM11/LILRA4/ASB2/LILRA3/SIGLEC9/CD33/MICA/HLA-DPB1/TYROBP/DZIP3/SIGLEC10/LILRB1/NCR3LG1/NFKBIE/HLA-DQA1/DAPP1/ITGAV/CD96/UBE2R2/ICAM4/HLA-DQB2/HLA-DQB1/SEC24A/MICB/RNF19B/TLR2/ASB8/PRR5/HSPA5/RNF19A/PTEN/EVL/FBXL15/HLA-F/ZBTB16/CD1C/WSB1/MAP3K8/LRSAM1/SIGLEC7/TLR6/RELA/RICTOR/MAP3K14/PPP2R5B/CD300C/HLA-DRB1/RNF144B/NCF1/KLRC1/RASGRP1/UBE2Z/INPP5D/RNF213/TUBB6/NCF4/OSCAR/SEC24D/SEC61B/REL/CD300A/MYLIP/FBXL8/ICAM1/TREM1/RIPK2/SYK/KLHL5
## REACTOME_CELL_CYCLE_CHECKPOINTS                                                                                                                                                                                                                                                                                                                                 SFN/RFC3/ERCC6L/CENPP/MAD2L1/UBE2C/PSMC3/CENPO/CENPN/BUB1/AURKB/MCM4/CCNE1/RMI2/ZWILCH/PSMB5/MCM10/SKA1/CLSPN/ORC1/PSMB3/NUP107/SPC24/ZWINT/NDC80/PSMA1/MCM2/RMI1/CDC20/BIRC5/CDC25A/RFC4/SPC25/NUF2/PMF1/CCNE2/CENPA/MCM7/PSMA7/CCNB1/RFC5/CDK1/CHEK1/PSME3/PSMA8/CENPH/CDC6/EXO1/ITGB3BP/CCNA2/KIF18A/INCENP/NUP85/WEE1/BRCA1/PSMB6/RPA3/DYNLL2/MCM3/KIF2C/PSMD14/PSMD11/ORC3/DNA2/ANAPC15/CDC45/BRCC3/CENPF/SEH1L/NUP43/CENPM/PSMD1/RAD1/PSMC6/PPP2R5A/PSMC2/CENPU/PSMD2/TOPBP1/ORC4/PPP1CC/CDCA8/PSMA2/UBE2S/NUDC/PSMC5
#positive NES
positive_NES_reactome_brca2.sa<-react.brca2.sa[react.brca2.sa$NES>0,]$ID
length(positive_NES_reactome_brca2.sa)
## [1] 56
#negative NES
negative_NES_reactome_brca2.sa<-react.brca2.sa[react.brca2.sa$NES<0,]$ID
length(negative_NES_reactome_brca2.sa)
## [1] 25

12.2.4 BRCA2.AF vs NOMUT.AF

set.seed(123)
react.brca2.af <- GSEA(geneList_brca2_af, TERM2GENE = R.symbol)
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
react.brca2.af<-react.brca2.af@result[react.brca2.af@result$p.adjust<0.05,]
head(react.brca2.af)
##                                                                                                                                                                                  ID
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS
## REACTOME_NEUTROPHIL_DEGRANULATION                                                                                                                 REACTOME_NEUTROPHIL_DEGRANULATION
## REACTOME_HEMOSTASIS                                                                                                                                             REACTOME_HEMOSTASIS
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                                                                           REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL                 REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL
## REACTOME_INTERFERON_SIGNALING                                                                                                                         REACTOME_INTERFERON_SIGNALING
##                                                                                                                                                                         Description
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS
## REACTOME_NEUTROPHIL_DEGRANULATION                                                                                                                 REACTOME_NEUTROPHIL_DEGRANULATION
## REACTOME_HEMOSTASIS                                                                                                                                             REACTOME_HEMOSTASIS
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                                                                           REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL                 REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL
## REACTOME_INTERFERON_SIGNALING                                                                                                                         REACTOME_INTERFERON_SIGNALING
##                                                                                           setSize
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS      33
## REACTOME_NEUTROPHIL_DEGRANULATION                                                             329
## REACTOME_HEMOSTASIS                                                                           367
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                                  457
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL             113
## REACTOME_INTERFERON_SIGNALING                                                                 128
##                                                                                           enrichmentScore
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS      -0.7581039
## REACTOME_NEUTROPHIL_DEGRANULATION                                                              -0.3879041
## REACTOME_HEMOSTASIS                                                                            -0.3829374
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                                   -0.3404789
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL              -0.4730615
## REACTOME_INTERFERON_SIGNALING                                                                  -0.4640274
##                                                                                                 NES
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS -2.353314
## REACTOME_NEUTROPHIL_DEGRANULATION                                                         -1.742677
## REACTOME_HEMOSTASIS                                                                       -1.736192
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                              -1.571489
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL         -1.861215
## REACTOME_INTERFERON_SIGNALING                                                             -1.844883
##                                                                                                 pvalue
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS 1.475050e-08
## REACTOME_NEUTROPHIL_DEGRANULATION                                                         2.792401e-07
## REACTOME_HEMOSTASIS                                                                       2.159802e-07
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                              4.104294e-06
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL         1.037963e-05
## REACTOME_INTERFERON_SIGNALING                                                             9.407337e-06
##                                                                                               p.adjust
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS 1.362946e-05
## REACTOME_NEUTROPHIL_DEGRANULATION                                                         8.600596e-05
## REACTOME_HEMOSTASIS                                                                       8.600596e-05
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                              9.480918e-04
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL         1.598462e-03
## REACTOME_INTERFERON_SIGNALING                                                             1.598462e-03
##                                                                                                 qvalue
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS 1.180040e-05
## REACTOME_NEUTROPHIL_DEGRANULATION                                                         7.446403e-05
## REACTOME_HEMOSTASIS                                                                       7.446403e-05
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                              8.208587e-04
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL         1.383950e-03
## REACTOME_INTERFERON_SIGNALING                                                             1.383950e-03
##                                                                                           rank
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS 1156
## REACTOME_NEUTROPHIL_DEGRANULATION                                                         2316
## REACTOME_HEMOSTASIS                                                                       2019
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                              2025
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL         1808
## REACTOME_INTERFERON_SIGNALING                                                             2059
##                                                                                                             leading_edge
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS tags=58%, list=11%, signal=52%
## REACTOME_NEUTROPHIL_DEGRANULATION                                                         tags=34%, list=21%, signal=27%
## REACTOME_HEMOSTASIS                                                                       tags=31%, list=18%, signal=26%
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                              tags=28%, list=18%, signal=23%
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL         tags=39%, list=16%, signal=33%
## REACTOME_INTERFERON_SIGNALING                                                             tags=36%, list=19%, signal=30%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                core_enrichment
## REACTOME_ANTIGEN_ACTIVATES_B_CELL_RECEPTOR_BCR_LEADING_TO_GENERATION_OF_SECOND_MESSENGERS                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            PIK3R1/STIM1/IGKV1-17/SYK/BLK/IGKV1-39/IGKV4-1/ORAI1/IGKV2-28/PLCG2/PIK3AP1/IGHV1-69/CD79A/CD19/LYN/PTPN6/ORAI2/CD22/IGHM
## REACTOME_NEUTROPHIL_DEGRANULATION                                                                                                               FTH1/ARL8A/IMPDH2/FGL2/SNAP23/TRAPPC1/CHI3L1/RNASE2/CXCL1/TLR2/CD14/SLC2A3/SLC11A1/STK10/PTPRC/GAA/P2RX1/RAB24/ARSA/CSTB/MAN2B1/A1BG/CSNK2B/HPSE/GLIPR1/SELL/LPCAT1/ELANE/GPI/FCER1G/DEFA1/CD36/LILRB2/ACTR1B/CNN2/TOM1/RAB37/TBC1D10C/ATG7/RHOG/SIGLEC5/ITGB2/PPBP/PLD1/SYNGR1/CTSB/LAIR1/QSOX1/PRG2/ITGAL/TYROBP/GHDC/RAB31/S100A8/TSPAN14/AP1M1/AZU1/FCN1/MCEMP1/DBNL/ALDOC/CD93/ATAD3B/HBB/COTL1/C5AR1/CD300A/S100A9/RNASET2/PLAC8/TNFAIP6/CLEC4D/AMPD3/DOK3/FCAR/PKM/XRCC6/RAB5B/SNAP29/CD53/CD68/CD44/HSPA1A/CKAP4/S100A12/TNFRSF1B/TMC6/ARHGAP9/CFP/NFKB1/MGAM/TMEM63A/NAPRT/C3AR1/ADGRG3/PTPN6/DSP/C3/PRKCD/TRPM2/PLAUR/SIRPA/SIGLEC9/HVCN1/FGR/TCIRG1/HSPA6/CLEC5A/GPR84/PTX3/B4GALT1
## REACTOME_HEMOSTASIS                                                                                                                          STXBP2/P2RX1/RBSN/GP5/TUBB6/PLEK/ESAM/IFNA5/CBX5/KIF26A/A1BG/AKT1/AKAP1/SELL/DOCK3/ITGA4/FCER1G/ATP1B1/CD36/CENPE/PIK3CA/CD244/RHOG/IRF2/PHACTR2/APBB1IP/LGALS3BP/ITGB2/CLU/GNA13/GNG11/KIF21B/TNFRSF10D/PPBP/QSOX1/P2RX5/DGKZ/DOCK5/ITGAL/CRK/SERPINA5/F2RL2/GP9/PIK3R1/STIM1/CD74/GNA12/IGKV1-17/MAFF/FERMT3/SYK/GTPBP2/ITGA2B/IGKV1-39/APLP2/KIF23/HBB/MICAL1/IGHA1/TP53/IGKV4-1/HBD/SH2B2/DOCK10/RASGRP2/GNG8/COL1A1/ORAI1/PDE9A/IGKV2-28/PLCG2/GNB4/ANXA5/CD44/ATP2B1/IGHV1-69/ATP2A3/CSK/SDC2/ITGA5/FLNA/TUBB1/PF4/TGFB1/PTK2/IRF1/SLC7A7/GP1BB/PF4V1/SHC1/VEGFA/TIMP1/TLN1/F13A1/LYN/VEGFB/PTPN6/AKAP10/JAM2/EHD1/CD48/ORAI2/PRKCD/PLA2G4A/PLAUR/SIRPA/THBS1/DGKD/FGR/F3/SH2B3/INPP5D/IGHM
## REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM                                              NEDD4/AAAS/STXBP2/IFNA5/ADAM17/CCL3/VAMP2/HLA-DQA1/GBP3/EGR1/TAB3/ICAM1/IRF3/AKT1/OAS2/IL18RAP/CXCL8/MYC/IFITM3/IL24/CD36/BCL6/IFITM1/PML/TRIM22/NKIRAS2/RELB/CNN2/EBI3/IFNGR2/PIK3CA/IRF2/JAK1/TNFRSF1A/PDCD4/ITGB2/IL1B/UBA7/IL1R1/KPNA5/NUP210/IRF9/MX1/STAT4/HLA-DRB1/IL17RA/IRAK2/CRK/IFNGR1/JUNB/PTK2B/PIK3R1/GBP6/NCAM1/FLNB/UBE2L6/GBP4/SYK/NUP58/S100B/LTB/CXCL2/IL4R/CLCF1/IL20/TP53/IFIT2/IRAK3/TSLP/PELI1/IL1RN/GBP5/OAS3/CRLF2/TNFSF15/DUSP3/CD40/ISG20/CD44/NOD2/LGALS9/HCK/CCL20/CSK/VIM/S100A12/FLNA/TNFRSF1B/TGFB1/STAT6/IRF1/MAPKAPK2/MEF2C/MX2/P4HB/POM121C/NUP62/TYK2/SHC1/NFKB1/IL36G/NFKB2/VEGFA/TIMP1/XAF1/F13A1/IL16/LYN/TNFSF13B/PTPN6/SMAD3/PRKCD/IL1A/IRF8/FGF2/RELA/IRF7/CEBPD/CCL2/FOS/SH2B3/IRS2/PTGS2/INPP5D/CSF3/IL6
## REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL                                                                                                                                                                                                                                                                                                                                                                                                                                                         ICAM1/SELL/CDH1/ITGA4/LILRB2/IFITM1/KIR2DL2/FCGR3A/ICAM5/SIGLEC5/ITGB2/CLEC2D/PILRA/LILRB1/LAIR1/ITGAL/TYROBP/IGKV1-17/LAIR2/IGKV1-39/CD300A/IGKV4-1/TREML2/COL1A1/TREML1/IGKV2-28/CD40/SFTPD/IGHV1-69/LILRA1/KLRF1/ICAM4/CD19/LILRA6/SH2D1B/PILRB/SIGLEC7/LILRA2/C3/KLRB1/KLRG1/SIGLEC9/CLEC2B/CD22
## REACTOME_INTERFERON_SIGNALING                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         IFITM2/NEDD4/AAAS/IFNA5/HLA-DQA1/GBP3/EGR1/ICAM1/IRF3/OAS2/IFITM3/IFITM1/PML/TRIM22/IFNGR2/IRF2/JAK1/UBA7/KPNA5/NUP210/IRF9/MX1/HLA-DRB1/IFNGR1/GBP6/NCAM1/FLNB/UBE2L6/GBP4/NUP58/IFIT2/GBP5/OAS3/ISG20/CD44/FLNA/IRF1/MX2/POM121C/NUP62/TYK2/XAF1/PTPN6/PRKCD/IRF8/IRF7
#positive
positive_NES_reactome_brca2.af<-react.brca2.af[react.brca2.af$NES>0,]$ID
length(positive_NES_reactome_brca2.af)
## [1] 3
#negative NES
negative_NES_reactome_brca2.af<-react.brca2.af[react.brca2.af$NES<0,]$ID
length(negative_NES_reactome_brca2.af)
## [1] 50

12.2.5 Comparisons

12.2.5.1 BRCA1.SA vs BRCA1.AF

Positive pathways shared between BRCA1 healthy and BRCA1 affected:

intersect(positive_NES_reactome_brca1.sa,positive_NES_reactome_brca1.af)
## [1] "REACTOME_METABOLISM_OF_RNA"
venn.diagram(
  x = list(positive_NES_reactome_brca1.sa, positive_NES_reactome_brca1.af),
  category.names = c("BRCA1.HEALTHY", "BRCA1.AFFECTED"),
  filename = 'reactome_positive_venn_diagramm_brca1_vs_brca1_50filt.png',
  output = TRUE,
  main = 'Positive BRCA1 Healthy vs Positive BRCA1 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),
   cat.pos = c(-27, 165))
## [1] 1

The only positive pathway that is found in BRCA1 affected, is found in BRCA1 healthy.

Negative pathways shared between BRCA1 healthy and BRCA1 affected:

intersect(negative_NES_reactome_brca1.sa,negative_NES_reactome_brca1.af)
## [1] "REACTOME_NEUTROPHIL_DEGRANULATION"         
## [2] "REACTOME_EXTRACELLULAR_MATRIX_ORGANIZATION"
venn.diagram(
  x = list(negative_NES_reactome_brca1.sa, negative_NES_reactome_brca1.af),
  category.names = c("BRCA1.HEALTHY", "BRCA1.AFFECTED"),
  filename = 'reactome_negative_venn_diagramm_brca1_vs_brca1_50filt.png',
  output = TRUE,
  main = 'Negative BRCA1 Healthy vs Negative BRCA1 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),cat.pos = c(-27, 160))
## [1] 1

The 2 pathways found in BRCA1 affected are shared with BRCA1 healthy pathways.

Positive BCRA1 healthy pathways that are negative in BCRA1 affected:

intersect(positive_NES_reactome_brca1.sa,negative_NES_reactome_brca1.af)
## character(0)
venn.diagram(
  x = list(positive_NES_reactome_brca1.sa, negative_NES_reactome_brca1.af),
  category.names = c("BRCA1.HEALTHY", "BRCA1.AF"),
  filename = 'reactome_pos_neg_venn_diagramm_brca1sa_vs_brca1af_50filt.png',
  output = TRUE,
  main = 'Venn diagram BRCA1 Healthy vs BRCA1 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"))
## [1] 1

There are no positive BRCA1 healthy pathways that are shared with the negative BRCA1 affected pathways.

Negative BCRA1 healthy pathways that are positive in BCRA1 affected:

intersect(negative_NES_reactome_brca1.sa,positive_NES_reactome_brca1.af)
## character(0)
venn.diagram(
  x = list(negative_NES_reactome_brca1.sa, positive_NES_reactome_brca1.af),
  category.names = c("BRCA1.HEALTHY", "BRCA1.AFFECTED"),
  filename = 'reactome_neg_pos_venn_diagramm_brca1sa_vs_brca1af_50filt.png',
  output = TRUE,
  main = 'Venn diagram BRCA1 Healthy vs BRCA1 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"))
## [1] 1

There are no negative BRCA1 healthy pathways shared with the positive of BRCA1 affected.

12.2.5.2 BRCA2.SA vs BRCA2.AF

Positive pathways shared between BRCA2 healthy and BRCA2 affected:

intersect(positive_NES_reactome_brca2.sa,positive_NES_reactome_brca2.af)
## [1] "REACTOME_KERATINIZATION"
venn.diagram(
  x = list(positive_NES_reactome_brca2.sa, positive_NES_reactome_brca2.af),
  category.names = c("BRCA2.HEALTHY", "BRCA2.AFFECTED"),
  filename = 'reactome_positive_venn_diagramm_brca2_vs_brca2_50filt.png',
  output = TRUE,
  main = 'Venn diagram BRCA2 Healthy vs BRCA2 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),cat.pos = c(-27,100),cat.dist= c(0.02,-0.05))
## [1] 1

Negative pathways shared between BRCA2 healthy and BRCA2 affected:

intersect(negative_NES_reactome_brca2.sa,negative_NES_reactome_brca2.af)
##  [1] "REACTOME_ADAPTIVE_IMMUNE_SYSTEM"                                                  
##  [2] "REACTOME_RHO_GTPASE_CYCLE"                                                        
##  [3] "REACTOME_SIGNALING_BY_INTERLEUKINS"                                               
##  [4] "REACTOME_IMMUNOREGULATORY_INTERACTIONS_BETWEEN_A_LYMPHOID_AND_A_NON_LYMPHOID_CELL"
##  [5] "REACTOME_CDC42_GTPASE_CYCLE"                                                      
##  [6] "REACTOME_TOLL_LIKE_RECEPTOR_CASCADES"                                             
##  [7] "REACTOME_PLATELET_ACTIVATION_SIGNALING_AND_AGGREGATION"                           
##  [8] "REACTOME_CYTOKINE_SIGNALING_IN_IMMUNE_SYSTEM"                                     
##  [9] "REACTOME_TOLL_LIKE_RECEPTOR_9_TLR9_CASCADE"                                       
## [10] "REACTOME_TOLL_LIKE_RECEPTOR_TLR1_TLR2_CASCADE"                                    
## [11] "REACTOME_SIGNALING_BY_CSF3_G_CSF"
venn.diagram(
  x = list(negative_NES_reactome_brca2.sa, negative_NES_reactome_brca2.af),
  category.names = c("BRCA2.HEALTHY", "BRCA2.AFFECTED"),
  filename = 'reactome_negative_venn_diagramm_brca2_vs_brca2_50filt.png',
  output = TRUE,
  main = 'Venn diagram BRCA2 Healthy vs BRCA2 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),cat.pos = c(-27,160),cat.dist= c(0.02,-0.05))
## [1] 1

Positive BRCA2 healthy pathways that are negative in BRCA2 affected:

intersect(positive_NES_reactome_brca2.sa,negative_NES_reactome_brca2.af)
## character(0)
venn.diagram(
  x = list(positive_NES_reactome_brca2.sa, negative_NES_reactome_brca2.af),
  category.names = c("BRCA2.HEALTHY", "BRCA2.AFFECTED"),
  filename = 'reactome_pos_neg_venn_diagramm_brca2_vs_brca2_50filt.png',
  output = TRUE,
  main = 'Positive BRCA2 Healthy vs Negative BRCA2 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),cat.pos=c(30,-30),cat.dist=c(0,0))
## [1] 1

Negative BRCA2 healthy pathways that are positive in BRCA2 affected:

intersect(negative_NES_reactome_brca2.sa,positive_NES_reactome_brca2.af)
## character(0)
venn.diagram(
  x = list(negative_NES_reactome_brca2.sa, positive_NES_reactome_brca2.af),
  category.names = c("BRCA2.HEALTHY", "BRCA2.AFFECTED"),
  filename = 'reactome_neg_pos_venn_diagramm_brca2sa_vs_brca2af_50filt.png',
  output = TRUE,
  main = 'Negative BRCA2 Healthy vs Positive BRCA2 Affected',
  col = "transparent",
  fill = c('forestgreen', 'coral'),
  print.mode=c("raw","percent"),cat.pos = c(-27, 170))
## [1] 1